diff options
author | George Hazan <george.hazan@gmail.com> | 2015-03-15 15:30:03 +0000 |
---|---|---|
committer | George Hazan <george.hazan@gmail.com> | 2015-03-15 15:30:03 +0000 |
commit | b81bd804e435e76592f9f281b717c696c3618fa2 (patch) | |
tree | 57c9295922457227265ba59e6f3f507ad038e194 /plugins/Dbx_kv/src/hamsterdb | |
parent | 641c67a9d552b07664308c2ae3384cc95a75a2d0 (diff) |
initial release
git-svn-id: http://svn.miranda-ng.org/main/trunk@12409 1316c22d-e87f-b044-9b9b-93d7a3e3ba9c
Diffstat (limited to 'plugins/Dbx_kv/src/hamsterdb')
139 files changed, 41937 insertions, 0 deletions
diff --git a/plugins/Dbx_kv/src/hamsterdb/AUTHORS b/plugins/Dbx_kv/src/hamsterdb/AUTHORS new file mode 100644 index 0000000000..6b0d2a235f --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/AUTHORS @@ -0,0 +1,6 @@ + +Ger Hobbelt (http://www.hobbelt.com, http://www.hebbut.net - THANKS!) + ham_env_get_parameters, ham_db_get_parameters and functions for approximate + matching, minor bugfixes and performance improvements plus documentation + fixes/improvements; a complete rewrite of the freelist code with HUGE + performance gains - THANKS! diff --git a/plugins/Dbx_kv/src/hamsterdb/COPYING b/plugins/Dbx_kv/src/hamsterdb/COPYING new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/COPYING @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/plugins/Dbx_kv/src/hamsterdb/CREDITS b/plugins/Dbx_kv/src/hamsterdb/CREDITS new file mode 100644 index 0000000000..d2571aeebd --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/CREDITS @@ -0,0 +1,6 @@ + +Jul 20, 2009 +ham_env_get_parameters, ham_db_get_parameters and functions for approximate +matching, minor bugfixes and performance improvements plus documentation +improvements were written by Ger Hobbelt, http://www.hobbelt.com, +http://www.hebbut.net - THANKS! diff --git a/plugins/Dbx_kv/src/hamsterdb/NEWS b/plugins/Dbx_kv/src/hamsterdb/NEWS new file mode 100644 index 0000000000..da7acb2eb3 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/NEWS @@ -0,0 +1 @@ +See http://hamsterdb.com for up-to-date news about the project. diff --git a/plugins/Dbx_kv/src/hamsterdb/README b/plugins/Dbx_kv/src/hamsterdb/README new file mode 100644 index 0000000000..66fba73f22 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/README @@ -0,0 +1,261 @@ +hamsterdb 2.1.10 Mo 23. Feb 23:07:52 CET 2015 +(C) Christoph Rupp, chris@crupp.de; http://www.hamsterdb.com + +This is the README file of hamsterdb. + +Contents: + +1. About + +hamsterdb is a database engine written in C/C++. It is fast, production-proven +and easy to use. + +This release has several bug fixes (see below for a list). Thanks to those who +reported them and invested lots of time to come up with samples to reproduce +the bugs. 
+ +The flag HAM_RECORD_NUMBER is deprecated. It is replaced with +HAM_RECORD_NUMBER64 for 64bit record numbers, and HAM_RECORD_NUMBER32 for +32bit record numbers. + +A major change under the hood: dirty pages are now flushed asynchronously. +Expect performance improvements in this release, and more to come in the +next releases. + +2. Changes + +New Features +* Added Cursor.TryFind to hamsterdb-dotnet + (thanks, mjmckp <matthew.j.m.peacock@gmail.com>) +* The page cache eviction was moved to a background thread +* When reading records from mmapped storage, a pointer into the storage + is returned and the record data is no longer copied + +Bugfixes +* Fixed FreeBSD compilation errors (thanks, Heping Wen) +* issue #46: fixed segfault in approx. matching (thanks, Joel + Jacobson) +* issue #45: fixed segfault in Journal recovery (thanks, Michael + Moellney) +* issue #44: approx. matching returned the wrong key (thanks, Joel + Jacobson) +* issue #43: fixed segfault when flushing transactions (thanks, Joel + Jacobson) +* Fixed compilation error on debian Wheezy, gcc 4.7.2, 32bit (thanks, + Thomas Fähnle) +* Fixed compilation error on OSX (thanks, Daniel Lemire) +* issue #42: ham_cursor_find returned wrong key w/ approx. matching and + transactions +* Fixed large file support on linux (thanks, Thomas Fähnle) + +Other Changes +* Default compilation flag is now -O3 +* Added a new parameter HAM_PARAM_POSIX_FADVISE (thanks, Thomas Fähnle) +* Removed dependency to malloc.h +* The github wiki is now linked into documentation/wiki +* The macro HAM_API_REVISION is now deprecated; use HAM_VERSION_* instead +* Deprecated HAM_RECORD_NUMBER (use HAM_RECORD_NUMBER64 instead); + introduced a new flag HAM_RECORD_NUMBER32 for 32bit record numbers +* Implemented ham_cursor_get_record_size() for remote access + +To see a list of all changes, look in the file ChangeLog. + +3. Roadmap +- See https://github.com/cruppstahl/hamsterdb/wiki/Roadmap + +4. 
Features + +- PRO: SIMD instructions for lookups +- PRO: transparent AES encryption +- PRO: transparent CRC32 verification +- PRO: transparent compression for journal, keys and records using + zlib, snappy, lzf or lzo +- PRO: compression for uint32 keys + +- Very fast sorted B+Tree with variable length keys +- Basic schema support for POD types (i.e. uint32, uint64, real32 etc) +- Very fast analytical functions +- Can run as an in-memory database +- Multiple databases in one file +- Record number databases ("auto-increment") +- Duplicate keys +- Logging and recovery +- Unlimited number of parallel Transactions +- Partial reading/writing of records +- Network access (remote databases) via TCP/Protocol Buffers +- Very fast database cursors +- Configurable page size, cache size, key size etc +- Runs on Linux, Unices, Microsoft Windows and other architectures +- Uses memory mapped I/O for fast disk access (but falls back to read/write if + mmap is not available) +- Uses 64bit file pointers and supports huge files (>2 GB) +- Easy to use and well-documented +- Open source and released under APL 2.0 license +- Wrappers for C++, Java, .NET, Erlang, Python, Ada and others + +5. Known Issues/Bugs + +None. + +6. Compiling + +6.1 Linux, MacOS and other Unix systems + +To compile hamsterdb, run ./configure, make, make install. + +Run `./configure --help' for more options (i.e. static/dynamic library, +build with debugging symbols etc). + +6.2 Microsoft Visual Studio 8 + +A Solution file is provided for Microsoft Visual C++ in the "win32" folder +for MSVC 2008 and MSVC 2010. +All libraries can be downloaded precompiled from the hamsterdb webpage. + +To download Microsoft Visual Studio Express Edition for free, go to +http://msdn.microsoft.com/vstudio/express/visualc/default.aspx. 
+ +6.3 Dependencies + +On Ubuntu, the following packages are required: + - libdb-dev (optional) + - protobuf-compiler + - libprotobuf-dev + - libgoogle-perftools-dev + - libboost-system-dev + - libboost-thread-dev + - libboost-dev + - (libuv needs to be installed from sources - see + https://github.com/joyent/libuv) + +For Windows, precompiled dependencies are available here: +https://github.com/cruppstahl/hamsterdb-alien + +7. Testing and Example Code + +Make automatically compiles several example programs in the directory +'samples'. To see hamsterdb in action, just run 'samples/db1' +or any other sample. (or 'win32/out/samples/db1/db1.exe' on Windows platforms). + +8. API Documentation + +The header files in 'include/ham' have extensive comments. Also, a doxygen +script is available; run 'make doc' to start doxygen. The generated +documentation is also available on the hamsterdb web page. + +9. Other Ways to Compile hamsterdb + +If you want to compile hamsterdb without using the provided ./configure +environment, you have to set some preprocessor macros: + +DEBUG enable debugging output and diagnostic checks (slow!) +HAM_32BIT compile for 32bit (alias: WIN32) +HAM_64BIT compile for 64bit (alias: WIN64, also needs WIN32) + +Also, if you compile for windows, you have to compile the file +'src/os_win32.cc' and ignore the file 'src/os_posix.cc'. Vice versa on +non-Windows platforms. + +10. Porting hamsterdb + +Porting hamsterdb shouldn't be too difficult. All operating- +system-dependent functions are declared in 'src/os.h' and defined +in 'src/os_win32.cc' or 'src/os_posix.cc'. +Other compiler- and OS-specific macros are in 'include/ham/types.h'. +Most likely, these are the only files which have to be touched. Also see item +9) for important macros. + +11. Migrating files from older versions + +Usually, hamsterdb releases are backwards compatible. There are some exceptions, +though. In this case tools are provided to migrate the database. 
First, export +your existing database with ham_export linked against the old version. +(ham_export links statically and will NOT be confused if your system has a +newer version of hamsterdb installed). Then use the newest version of +ham_import to import the data into a new database. You can find ham_export +and ham_import in the "tools" subdirectory. + + Example (ham_export of 2.1.2 was renamed to ham_export-2.1.2 to document + that it's an older version): + + ham_export-2.1.2 input.db | ham_import --stdin output.db + +12. Licensing + +hamsterdb is released under the APL 2.0 license, which allows +unrestricted use for commercial and non-commercial applications. See the +file COPYING for more information. + +A commercial, closed source version hamsterdb pro with additional functionality +is available on request. See http://hamsterdb.com for more information. + +13. Contact + +Author of hamsterdb is + Christoph Rupp + Paul-Preuss-Str. 63 + 80995 Muenchen/Germany + email: chris@crupp.de + web: http://www.hamsterdb.com + +14. Other Copyrights + +The Google Protocol Buffers ("protobuf") library is Copyright 2008, Google Inc. +It has the following license: + + Copyright 2008, Google Inc. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + Code generated by the Protocol Buffer compiler is owned by the owner + of the input file used when generating it. This code is not + standalone and requires a support library to be linked with it. This + support library is itself covered by the above license. + +The libuv library is part of the Node project: http://nodejs.org/ +libuv may be distributed alone under Node's license: + + Copyright Joyent, Inc. and other Node contributors. All rights reserved. + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to + deal in the Software without restriction, including without limitation the + rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + sell copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + IN THE SOFTWARE. + diff --git a/plugins/Dbx_kv/src/hamsterdb/config.h b/plugins/Dbx_kv/src/hamsterdb/config.h new file mode 100644 index 0000000000..d1fbc4d2f5 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/config.h @@ -0,0 +1,10 @@ +#define _CRT_SECURE_NO_WARNINGS + +#define HAM_EXPORT + +#define BOOST_SYSTEM_NO_DEPRECATED + +#define HAVE_MMAP 1 +#define HAVE_UNMMAP 1 + +#pragma warning(disable:4100 4127 4512)
\ No newline at end of file diff --git a/plugins/Dbx_kv/src/hamsterdb/include/ham/hamsterdb.h b/plugins/Dbx_kv/src/hamsterdb/include/ham/hamsterdb.h new file mode 100644 index 0000000000..668cfc7cde --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/include/ham/hamsterdb.h @@ -0,0 +1,2535 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file hamsterdb.h + * @brief Include file for hamsterdb Embedded Storage + * @author Christoph Rupp, chris@crupp.de + * @version 2.1.10 + * + * @mainpage + * + * This manual documents the hamsterdb C API. hamsterdb is a key/value database + * that is linked directly into your application, avoiding all the overhead + * that is related to external databases and RDBMS systems. + * + * This header file declares all functions and macros that are needed to use + * hamsterdb. The comments are formatted in Doxygen style and can be extracted + * to automagically generate documentation. The documentation is also available + * online here: <a href="http://hamsterdb.com/public/scripts/html_www"> + http://hamsterdb.com/public/scripts/html_www</a>. + * + * In addition, there's a tutorial book hosted on github: + * <a href="http://github.com/cruppstahl/hamsterdb/wiki/Tutorial"> + http://github.com/cruppstahl/hamsterdb/wiki/Tutorial</a>. 
+ * + * If you want to create or open Databases or Environments (a collection of + * multiple Databases), the following functions will be interesting for you: + * <table> + * <tr><td>@ref ham_env_create</td><td>Creates an Environment</td></tr> + * <tr><td>@ref ham_env_open</td><td>Opens an Environment</td></tr> + * <tr><td>@ref ham_env_close</td><td>Closes an Environment</td></tr> + * <tr><td>@ref ham_env_create_db</td><td>Creates a Database in an + Environment</td></tr> + * <tr><td>@ref ham_env_open_db</td><td>Opens a Database from an + Environment</td></tr> + * <tr><td>@ref ham_db_close</td><td>Closes a Database</td></tr> + * </table> + * + * To insert, lookup or delete key/value pairs, the following functions are + * used: + * <table> + * <tr><td>@ref ham_db_insert</td><td>Inserts a key/value pair into a + Database</td></tr> + * <tr><td>@ref ham_db_find</td><td>Lookup of a key/value pair in a + Database</td></tr> + * <tr><td>@ref ham_db_erase</td><td>Erases a key/value pair from a + Database</td></tr> + * </table> + * + * Alternatively, you can use Cursors to iterate over a Database: + * <table> + * <tr><td>@ref ham_cursor_create</td><td>Creates a new Cursor</td></tr> + * <tr><td>@ref ham_cursor_find</td><td>Positions the Cursor on a key</td></tr> + * <tr><td>@ref ham_cursor_insert</td><td>Inserts a new key/value pair with a + Cursor</td></tr> + * <tr><td>@ref ham_cursor_erase</td><td>Deletes the key/value pair that + the Cursor points to</td></tr> + * <tr><td>@ref ham_cursor_overwrite</td><td>Overwrites the value of the current key</td></tr> + * <tr><td>@ref ham_cursor_move</td><td>Moves the Cursor to the first, next, + previous or last key in the Database</td></tr> + * <tr><td>@ref ham_cursor_close</td><td>Closes the Cursor</td></tr> + * </table> + * + * If you want to use Transactions, then the following functions are required: + * <table> + * <tr><td>@ref ham_txn_begin</td><td>Begins a new Transaction</td></tr> + * <tr><td>@ref ham_txn_commit</td><td>Commits 
the current + Transaction</td></tr> + * <tr><td>@ref ham_txn_abort</td><td>Aborts the current Transaction</td></tr> + * </table> + * + * hamsterdb supports remote Databases. The server can be embedded + * into your application or run standalone (see tools/hamzilla for a Unix + * daemon or Win32 service which hosts Databases). If you want to embed the + * server then the following functions have to be used: + * <table> + * <tr><td>@ref ham_srv_init</td><td>Initializes the server</td></tr> + * <tr><td>@ref ham_srv_add_env</td><td>Adds an Environment to the + server. The Environment with all its Databases will then be available + remotely.</td></tr> + * <tr><td>@ref ham_srv_close</td><td>Closes the server and frees all allocated + resources</td></tr> + * </table> + * + * If you need help then you're always welcome to use the <a + href="https://groups.google.com/forum/?fromgroups#!forum/hamsterdb-user"> + mailing list</a>, + * drop a message (chris at crupp dot de) or use the <a + href="http://hamsterdb.com/index/contact">contact form</a>. + * + * Have fun! + */ + +#ifndef HAM_HAMSTERDB_H +#define HAM_HAMSTERDB_H + +#include <ham/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* deprecated */ +#define HAM_API_REVISION 3 + +/** + * The version numbers + * + * @remark A change of the major revision means a significant update + * with a lot of new features and API changes. + * + * The minor version means a significant update without API changes, and the + * revision is incremented for each release with minor improvements only. + * + * The file version describes the version of the binary database format. + * hamsterdb is neither backwards- nor forwards-compatible regarding file + * format changes. + * + * If a file was created with hamsterdb pro then the msb of the file version + * is set. hamsterdb pro is able to open files created with hamsterdb (APL + * version), but not vice versa. 
+ * + * History of file versions: + * 2.1.0: introduced the file version; version is 0 + * 2.1.3: new btree format, file format cleanups; version is 1 + * 2.1.4: new btree format for duplicate keys/var. length keys; version is 2 + * 2.1.5: new freelist; version is 3 + * 2.1.10: changes in btree node format; version is 4 + */ +#define HAM_VERSION_MAJ 2 +#define HAM_VERSION_MIN 1 +#define HAM_VERSION_REV 10 +#define HAM_FILE_VERSION 4 + +/** + * The hamsterdb Database structure + * + * This structure is allocated in @ref ham_env_create_db and + * @ref ham_env_open_db. It is deleted in @a ham_db_close. + */ +struct ham_db_t; +typedef struct ham_db_t ham_db_t; + +/** + * The hamsterdb Environment structure + * + * This structure is allocated with @ref ham_env_create and @ref ham_env_open + * and is deleted in @ref ham_env_close. + */ +struct ham_env_t; +typedef struct ham_env_t ham_env_t; + +/** + * A Database Cursor + * + * A Cursor is used for bi-directionally traversing the Database and + * for inserting/deleting/searching Database items. + * + * This structure is allocated with @ref ham_cursor_create and deleted with + * @ref ham_cursor_close. + */ +struct ham_cursor_t; +typedef struct ham_cursor_t ham_cursor_t; + +/** + * A generic record. + * + * A record represents data items in hamsterdb. Before using a record, it + * is important to initialize all record fields with zeroes, i.e. with + * the C library routines memset(3) or bzero(2). + * + * When hamsterdb returns a record structure, the pointer to the record + * data is provided in @a data. This pointer is only temporary and will be + * overwritten by subsequent hamsterdb API calls using the same Transaction + * (or, if Transactions are disabled, using the same Database). The pointer + * will also be invalidated after the Transaction is aborted or committed. + * + * To avoid this, the calling application can allocate the @a data pointer. + * In this case, you have to set the flag @ref HAM_RECORD_USER_ALLOC. 
The + * @a size parameter will then return the size of the record. It's the + * responsibility of the caller to make sure that the @a data parameter is + * large enough for the record. + * + * The record->data pointer is not threadsafe. For threadsafe access it is + * recommended to use @a HAM_RECORD_USER_ALLOC or have each thread manage its + * own Transaction. + */ +typedef struct { + /** The size of the record data, in bytes */ + uint32_t size; + + /** Pointer to the record data */ + void *data; + + /** The record flags; see @ref HAM_RECORD_USER_ALLOC */ + uint32_t flags; + + /** Offset for partial reading/writing; see @ref HAM_PARTIAL */ + uint32_t partial_offset; + + /** Size for partial reading/writing; see @ref HAM_PARTIAL */ + uint32_t partial_size; + +} ham_record_t; + +/** Flag for @ref ham_record_t (only really useful in combination with + * @ref ham_cursor_move, @ref ham_cursor_find and @ref ham_db_find) + */ +#define HAM_RECORD_USER_ALLOC 1 + +/** + * A macro to statically initialize a @ref ham_record_t structure. + * + * Usage: + * ham_record_t rec = ham_make_record(ptr, size); + */ +#define ham_make_record(PTR, SIZE) { SIZE, PTR, 0 } + +/** + * A generic key. + * + * A key represents key items in hamsterdb. Before using a key, it + * is important to initialize all key fields with zeroes, i.e. with + * the C library routines memset(3) or bzero(2). + * + * hamsterdb usually uses keys to insert, delete or search for items. + * However, when using Database Cursors and the function @ref ham_cursor_move, + * hamsterdb also returns keys. In this case, the pointer to the key + * data is provided in @a data. This pointer is only temporary and will be + * overwritten by subsequent calls to @ref ham_cursor_move using the + * same Transaction (or, if Transactions are disabled, using the same Database). + * The pointer will also be invalidated after the Transaction is aborted + * or committed. 
+ * + * To avoid this, the calling application can allocate the @a data pointer. + * In this case, you have to set the flag @ref HAM_KEY_USER_ALLOC. The + * @a size parameter will then return the size of the key. It's the + * responsibility of the caller to make sure that the @a data parameter is + * large enough for the key. + * + * The key->data pointer is not threadsafe. For threadsafe access it is + * recommended to use @a HAM_KEY_USER_ALLOC or have each thread manage its + * own Transaction. + */ +typedef struct { + /** The size of the key, in bytes */ + uint16_t size; + + /** The data of the key */ + void *data; + + /** The key flags; see @ref HAM_KEY_USER_ALLOC */ + uint32_t flags; + + /** For internal use */ + uint32_t _flags; + +} ham_key_t; + +/** + * A macro to statically initialize a @ref ham_key_t structure. + * + * Usage: + * ham_key_t key = ham_make_key(ptr, size); + */ +#define ham_make_key(PTR, SIZE) { SIZE, PTR, 0 } + +/** Flag for @ref ham_key_t (only really useful in combination with + * @ref ham_cursor_move, @ref ham_cursor_find and @ref ham_db_find) + */ +#define HAM_KEY_USER_ALLOC 1 + +/** + * A named parameter. + * + * These parameter structures are used for functions like @ref ham_env_open, + * @ref ham_env_create, etc. to pass variable length parameter lists. + * + * The lists are always arrays of type ham_parameter_t, with a terminating + * element of { 0, NULL}, e.g. + * + * <pre> + * ham_parameter_t parameters[] = { + * { HAM_PARAM_CACHE_SIZE, 2 * 1024 * 1024 }, // set cache size to 2 mb + * { HAM_PARAM_PAGE_SIZE, 4096 }, // set page size to 4 kb + * { 0, NULL } + * }; + * </pre> + */ +typedef struct { + /** The name of the parameter; all HAM_PARAM_*-constants */ + uint32_t name; + + /** The value of the parameter. 
*/ + uint64_t value; + +} ham_parameter_t; + + +/** + * @defgroup ham_key_types hamsterdb Key Types + * @{ + */ + +/** A binary blob without type; sorted by memcmp */ +#define HAM_TYPE_BINARY 0 +/** A binary blob without type; sorted by callback function */ +#define HAM_TYPE_CUSTOM 1 +/** An unsigned 8-bit integer */ +#define HAM_TYPE_UINT8 3 +/** An unsigned 16-bit integer */ +#define HAM_TYPE_UINT16 5 +/** An unsigned 32-bit integer */ +#define HAM_TYPE_UINT32 7 +/** An unsigned 64-bit integer */ +#define HAM_TYPE_UINT64 9 +/** A 32-bit float */ +#define HAM_TYPE_REAL32 11 +/** A 64-bit double */ +#define HAM_TYPE_REAL64 12 + +/** + * @} + */ + + +/** + * @defgroup ham_status_codes hamsterdb Status Codes + * @{ + */ + +/** Operation completed successfully */ +#define HAM_SUCCESS ( 0) +/** Invalid record size */ +#define HAM_INV_RECORD_SIZE ( -2) +/** Invalid key size */ +#define HAM_INV_KEY_SIZE ( -3) +/* deprecated */ +#define HAM_INV_KEYSIZE HAM_INV_KEY_SIZE +/** Invalid page size (must be 1024 or a multiple of 2048) */ +#define HAM_INV_PAGE_SIZE ( -4) +/* deprecated */ +#define HAM_INV_PAGESIZE HAM_INV_PAGE_SIZE +/** Memory allocation failed - out of memory */ +#define HAM_OUT_OF_MEMORY ( -6) +/** Invalid function parameter */ +#define HAM_INV_PARAMETER ( -8) +/** Invalid file header */ +#define HAM_INV_FILE_HEADER ( -9) +/** Invalid file version */ +#define HAM_INV_FILE_VERSION (-10) +/** Key was not found */ +#define HAM_KEY_NOT_FOUND (-11) +/** Tried to insert a key which already exists */ +#define HAM_DUPLICATE_KEY (-12) +/** Internal Database integrity violated */ +#define HAM_INTEGRITY_VIOLATED (-13) +/** Internal hamsterdb error */ +#define HAM_INTERNAL_ERROR (-14) +/** Tried to modify the Database, but the file was opened as read-only */ +#define HAM_WRITE_PROTECTED (-15) +/** Database record not found */ +#define HAM_BLOB_NOT_FOUND (-16) +/** Generic file I/O error */ +#define HAM_IO_ERROR (-18) +/** Function is not yet implemented */ +#define
HAM_NOT_IMPLEMENTED (-20) +/** File not found */ +#define HAM_FILE_NOT_FOUND (-21) +/** Operation would block */ +#define HAM_WOULD_BLOCK (-22) +/** Object was not initialized correctly */ +#define HAM_NOT_READY (-23) +/** Database limits reached */ +#define HAM_LIMITS_REACHED (-24) +/** Object was already initialized */ +#define HAM_ALREADY_INITIALIZED (-27) +/** Database needs recovery */ +#define HAM_NEED_RECOVERY (-28) +/** Cursor must be closed prior to Transaction abort/commit */ +#define HAM_CURSOR_STILL_OPEN (-29) +/** Record filter or file filter not found */ +#define HAM_FILTER_NOT_FOUND (-30) +/** Operation conflicts with another Transaction */ +#define HAM_TXN_CONFLICT (-31) +/* internal use: key was erased in a Transaction */ +#define HAM_KEY_ERASED_IN_TXN (-32) +/** Database cannot be closed because it is modified in a Transaction */ +#define HAM_TXN_STILL_OPEN (-33) +/** Cursor does not point to a valid item */ +#define HAM_CURSOR_IS_NIL (-100) +/** Database not found */ +#define HAM_DATABASE_NOT_FOUND (-200) +/** Database name already exists */ +#define HAM_DATABASE_ALREADY_EXISTS (-201) +/** Database already open, or: Database handle is already initialized */ +#define HAM_DATABASE_ALREADY_OPEN (-202) +/** Environment already open, or: Environment handle is already initialized */ +#define HAM_ENVIRONMENT_ALREADY_OPEN (-203) +/** Invalid log file header */ +#define HAM_LOG_INV_FILE_HEADER (-300) +/** Remote I/O error/Network error */ +#define HAM_NETWORK_ERROR (-400) + +/** + * @} + */ + + +/** + * @defgroup ham_static hamsterdb Static Functions + * @{ + */ + +/** + * A typedef for a custom error handler function + * + * This error handler can be used in combination with + * @ref ham_set_errhandler(). 
+ * + * @param message The error message + * @param level The error level: + * <ul> + * <li>@ref HAM_DEBUG_LEVEL_DEBUG (0) </li> a debug message + * <li>@ref HAM_DEBUG_LEVEL_NORMAL (1) </li> a normal error message + * <li>2</li> reserved + * <li>@ref HAM_DEBUG_LEVEL_FATAL (3) </li> a fatal error message + * </ul> + * + * @sa error_levels + */ +typedef void HAM_CALLCONV (*ham_errhandler_fun)(int level, const char *message); + +/** A debug message */ +#define HAM_DEBUG_LEVEL_DEBUG 0 + +/** A normal error message */ +#define HAM_DEBUG_LEVEL_NORMAL 1 + +/** A fatal error message */ +#define HAM_DEBUG_LEVEL_FATAL 3 + +/** + * Sets the global error handler + * + * This handler will receive all debug messages that are emitted + * by hamsterdb. You can install the default handler by setting @a f to 0. + * + * The default error handler prints all messages to stderr. To install a + * different logging facility, you can provide your own error handler. + * + * Note that the callback function must have the same calling convention + * as the hamsterdb library. 
+ * + * @param f A pointer to the error handler function, or NULL to restore + * the default handler + */ +HAM_EXPORT void HAM_CALLCONV +ham_set_errhandler(ham_errhandler_fun f); + +/** + * Translates a hamsterdb status code to a descriptive error string + * + * @param status The hamsterdb status code + * + * @return A pointer to a descriptive error string + */ +HAM_EXPORT const char * HAM_CALLCONV +ham_strerror(ham_status_t status); + +/** + * Returns the version of the hamsterdb library + * + * @param major If not NULL, will return the major version number + * @param minor If not NULL, will return the minor version number + * @param revision If not NULL, will return the revision version number + */ +HAM_EXPORT void HAM_CALLCONV +ham_get_version(uint32_t *major, uint32_t *minor, + uint32_t *revision); + +/** + * @} + */ + + +/** + * @defgroup ham_env hamsterdb Environment Functions + * @{ + */ + +/** + * Creates a Database Environment + * + * A Database Environment is a collection of Databases, which are all stored + * in one physical file (or in-memory). The maximum number of Databases + * depends on the page size; the default is above 600. + * + * Each Database in an Environment is identified by a positive 16bit + * value (except 0 and values at or above 0xf000). + * Databases in an Environment can be created with @ref ham_env_create_db + * or opened with @ref ham_env_open_db. + * + * Specify a URL instead of a filename (i.e. + * "ham://localhost:8080/customers.db") to access a remote hamsterdb Server. + * + * To enable ACID Transactions, supply the flag @ref HAM_ENABLE_TRANSACTIONS. + * By default, hamsterdb will use a Journal for recovering the Environment + * and its data in case of a crash, and also to re-apply committed Transactions + * which were not yet flushed to disk. This Journalling can be disabled + * with the flag @ref HAM_DISABLE_RECOVERY. (It is disabled if the Environment + * is in-memory.) 
+ * + * If Transactions are not required, but hamsterdb should still be able to + * recover in case of a crash or power outage, then the flag + * @ref HAM_ENABLE_RECOVERY will enable the Journal (without allowing + * Transactions.) + * + * For performance reasons the Journal does not use fsync(2) (or + * FlushFileBuffers on Win32) to flush modified buffers to disk. Use the flag + * @ref HAM_ENABLE_FSYNC to force the use of fsync. + * + * @param env A pointer to an Environment handle + * @param filename The filename of the Environment file. If the file already + * exists, it is overwritten. Can be NULL for an In-Memory + * Environment. Can be a URL ("ham://<hostname>:<port>/<environment>") + * for remote access. + * @param flags Optional flags for opening the Environment, combined with + * bitwise OR. Possible flags are: + * <ul> + * <li>@ref HAM_ENABLE_FSYNC</li> Flushes all file handles after + * committing or aborting a Transaction using fsync(), fdatasync() + * or FlushFileBuffers(). This flag has no effect + * if Transactions are disabled. Slows down performance but makes + * sure that all file handles and operating system caches are + * transferred to disk, thus providing a stronger durability. + * <li>@ref HAM_IN_MEMORY</li> Creates an In-Memory Environment. No + * file will be created, and the Database contents are lost after + * the Environment is closed. The @a filename parameter can + * be NULL. Do <b>NOT</b> specify @a cache_size other than 0. + * <li>@ref HAM_DISABLE_MMAP</li> Do not use memory mapped files for I/O. + * By default, hamsterdb checks if it can use mmap, + * since mmap is faster than read/write. For performance + * reasons, this flag should not be used. + * <li>@ref HAM_CACHE_UNLIMITED</li> Do not limit the cache. Nearly as + * fast as an In-Memory Database. Not allowed in combination + * with a limited cache size. + * <li>@ref HAM_ENABLE_TRANSACTIONS</li> Enables Transactions for this + * Environment.
This flag implies @ref HAM_ENABLE_RECOVERY. + * <li>@ref HAM_ENABLE_RECOVERY</li> Enables logging/recovery for this + * Environment. Not allowed in combination with @ref HAM_IN_MEMORY. + * <li>@ref HAM_DISABLE_RECOVERY</li> Disables logging/recovery for this + * Environment. + * <li>@ref HAM_FLUSH_WHEN_COMMITTED</li> Immediately flushes committed + * Transactions and writes them to the Btree. Disabled by default. If + * disabled then hamsterdb buffers committed Transactions and only starts + * flushing when too many Transactions were committed. + * </ul> + * + * @param mode File access rights for the new file. This is the @a mode + * parameter for creat(2). Ignored on Microsoft Windows. Default + * is 0644. + * @param param An array of ham_parameter_t structures. The following + * parameters are available: + * <ul> + * <li>@ref HAM_PARAM_CACHE_SIZE</li> The size of the Database cache, + * in bytes. The default size is defined in src/config.h + * as @a HAM_DEFAULT_CACHE_SIZE - usually 2MB + * <li>@ref HAM_PARAM_POSIX_FADVISE</li> Sets the "advice" for + * posix_fadvise(). Only on supported platforms. Allowed values are + * @ref HAM_POSIX_FADVICE_NORMAL (which is the default) or + * @ref HAM_POSIX_FADVICE_RANDOM. + * <li>@ref HAM_PARAM_PAGE_SIZE</li> The size of a file page, in + * bytes. It is recommended not to change the default size. The + * default size depends on hardware and operating system. + * Page sizes must be 1024 or a multiple of 2048. + * <li>@ref HAM_PARAM_FILE_SIZE_LIMIT</li> Sets a file size limit (in bytes). + * Disabled by default. Not allowed in combination with @ref HAM_IN_MEMORY. + * If the limit is exceeded, API functions return @ref HAM_LIMITS_REACHED. + * <li>@ref HAM_PARAM_LOG_DIRECTORY</li> The path of the log file + * and the journal files; default is the same path as the database + * file. Ignored for remote Environments. + * <li>@ref HAM_PARAM_NETWORK_TIMEOUT_SEC</li> Timeout (in seconds) when + * waiting for data from a remote server. 
By default, no timeout is set. + * </ul> + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if the @a env pointer is NULL or an + * invalid combination of flags or parameters was specified + * @return @ref HAM_IO_ERROR if the file could not be opened or + * reading/writing failed + * @return @ref HAM_INV_FILE_VERSION if the Environment version is not + * compatible with the library version + * @return @ref HAM_OUT_OF_MEMORY if memory could not be allocated + * @return @ref HAM_INV_PAGE_SIZE if @a page_size is not 1024 or + * a multiple of 2048 + * @return @ref HAM_INV_KEY_SIZE if @a key_size is too large (at least 4 + * keys must fit in a page) + * @return @ref HAM_WOULD_BLOCK if another process has locked the file + * @return @ref HAM_ENVIRONMENT_ALREADY_OPEN if @a env is already in use + * + * @sa ham_env_create + * @sa ham_env_close + * @sa ham_env_open + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_env_create(ham_env_t **env, const char *filename, + uint32_t flags, uint32_t mode, const ham_parameter_t *param); + +/** + * Opens an existing Database Environment + * + * This function opens an existing Database Environment. + * + * A Database Environment is a collection of Databases, which are all stored + * in one physical file (or in-memory). + * + * Each Database in an Environment is identified by a positive 16bit + * value (except 0 and values at or above 0xf000). + * Databases in an Environment can be created with @ref ham_env_create_db + * or opened with @ref ham_env_open_db. + * + * Specify a URL instead of a filename (i.e. + * "ham://localhost:8080/customers.db") to access a remote hamsterdb Server. + * + * Also see the documentation @ref ham_env_create about Transactions, Recovery + * and the use of fsync. 
+ * + * @param env A valid Environment handle + * @param filename The filename of the Environment file, or URL of a hamsterdb + * Server + * @param flags Optional flags for opening the Environment, combined with + * bitwise OR. Possible flags are: + * <ul> + * <li>@ref HAM_READ_ONLY </li> Opens the file for reading only. + * Operations that need write access (i.e. @ref ham_db_insert) will + * return @ref HAM_WRITE_PROTECTED. + * <li>@ref HAM_ENABLE_FSYNC</li> Flushes all file handles after + * committing or aborting a Transaction using fsync(), fdatasync() + * or FlushFileBuffers(). This flag has no effect + * if Transactions are disabled. Slows down performance but makes + * sure that all file handles and operating system caches are + * transferred to disk, thus providing a stronger durability. + * <li>@ref HAM_DISABLE_MMAP </li> Do not use memory mapped files for I/O. + * By default, hamsterdb checks if it can use mmap, + * since mmap is faster than read/write. For performance + * reasons, this flag should not be used. + * <li>@ref HAM_CACHE_UNLIMITED </li> Do not limit the cache. Nearly as + * fast as an In-Memory Database. Not allowed in combination + * with a limited cache size. + * <li>@ref HAM_ENABLE_TRANSACTIONS </li> Enables Transactions for this + * Environment. This flag implies @ref HAM_ENABLE_RECOVERY. + * <li>@ref HAM_ENABLE_RECOVERY </li> Enables logging/recovery for this + * Environment. Will return @ref HAM_NEED_RECOVERY, if the Environment + * is in an inconsistent state. Not allowed in combination + * with @ref HAM_IN_MEMORY. + * <li>@ref HAM_DISABLE_RECOVERY</li> Disables logging/recovery for this + * Environment. + * <li>@ref HAM_AUTO_RECOVERY </li> Automatically recover the Environment, + * if necessary. This flag implies @ref HAM_ENABLE_RECOVERY. + * <li>@ref HAM_FLUSH_WHEN_COMMITTED</li> Immediately flushes committed + * Transactions and writes them to the Btree. Disabled by default.
If + * disabled then hamsterdb buffers committed Transactions and only starts + * flushing when too many Transactions were committed. + * </ul> + * @param param An array of ham_parameter_t structures. The following + * parameters are available: + * <ul> + * <li>@ref HAM_PARAM_CACHE_SIZE </li> The size of the Database cache, + * in bytes. The default size is defined in src/config.h + * as @a HAM_DEFAULT_CACHE_SIZE - usually 2MB + * <li>@ref HAM_PARAM_POSIX_FADVISE</li> Sets the "advice" for + * posix_fadvise(). Only on supported platforms. Allowed values are + * @ref HAM_POSIX_FADVICE_NORMAL (which is the default) or + * @ref HAM_POSIX_FADVICE_RANDOM. + * <li>@ref HAM_PARAM_FILE_SIZE_LIMIT</li> Sets a file size limit (in bytes). + * Disabled by default. If the limit is exceeded, API functions + * return @ref HAM_LIMITS_REACHED. + * <li>@ref HAM_PARAM_LOG_DIRECTORY</li> The path of the log file + * and the journal files; default is the same path as the database + * file. Ignored for remote Environments. + * <li>@ref HAM_PARAM_NETWORK_TIMEOUT_SEC</li> Timeout (in seconds) when + * waiting for data from a remote server. By default, no timeout is set. + * </ul> + * + * @return @ref HAM_SUCCESS upon success. + * @return @ref HAM_INV_PARAMETER if the @a env pointer is NULL or an + * invalid combination of flags was specified + * @return @ref HAM_FILE_NOT_FOUND if the file does not exist + * @return @ref HAM_IO_ERROR if the file could not be opened or reading failed + * @return @ref HAM_INV_FILE_VERSION if the Environment version is not + * compatible with the library version.
+ * @return @ref HAM_OUT_OF_MEMORY if memory could not be allocated + * @return @ref HAM_WOULD_BLOCK if another process has locked the file + * @return @ref HAM_NEED_RECOVERY if the Database is in an inconsistent state + * @return @ref HAM_LOG_INV_FILE_HEADER if the logfile is corrupt + * @return @ref HAM_ENVIRONMENT_ALREADY_OPEN if @a env is already in use + * @return @ref HAM_NETWORK_ERROR if a remote server is not reachable + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_env_open(ham_env_t **env, const char *filename, + uint32_t flags, const ham_parameter_t *param); + +/** + * Retrieve the current value for a given Environment setting + * + * Only those values requested by the parameter array will be stored. + * + * The following parameters are supported: + * <ul> + * <li>HAM_PARAM_CACHE_SIZE</li> returns the cache size + * <li>HAM_PARAM_PAGE_SIZE</li> returns the page size + * <li>HAM_PARAM_MAX_DATABASES</li> returns the max. number of + * Databases of this Environment + * <li>HAM_PARAM_FLAGS</li> returns the flags which were used to + * open or create this Environment + * <li>HAM_PARAM_FILEMODE</li> returns the @a mode parameter which + * was specified when creating this Environment + * <li>HAM_PARAM_FILENAME</li> returns the filename (the @a value + * of this parameter is a const char * pointer cast to a + * uint64_t variable) + * <li>@ref HAM_PARAM_LOG_DIRECTORY</li> The path of the log file + * and the journal files. Ignored for remote Environments.
+ * <li>@ref HAM_PARAM_JOURNAL_COMPRESSION</li> Returns the + * selected algorithm for journal compression, or 0 if compression + * is disabled + * </ul> + * + * @param env A valid Environment handle + * @param param An array of ham_parameter_t structures + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if the @a env pointer is NULL or + * @a param is NULL + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_env_get_parameters(ham_env_t *env, ham_parameter_t *param); + +/** + * Creates a new Database in a Database Environment + * + * An Environment can contain a (limited) number of Databases; the exact + * limit depends on the page size and is above 600. + * + * Each Database in an Environment is identified by a positive 16bit + * value. 0 and values at or above 0xf000 are reserved. + * + * This function initializes the ham_db_t handle (the second parameter). + * When the handle is no longer in use, it should be closed with + * @ref ham_db_close. Alternatively, the Database handle is closed + * automatically if @ref ham_env_close is called with the flag + * @ref HAM_AUTO_CLEANUP. + * + * A Database can (and should) be configured and optimized for the data that + * is inserted. The data is described through flags and parameters. hamsterdb + * differentiates between several data characteristics, and offers predefined + * "types" to describe the keys. In general, the default key type + * (@ref HAM_TYPE_BINARY) is slower than the other types, and + * fixed-length binary keys (@ref HAM_TYPE_BINARY in combination with + * @ref HAM_PARAM_KEY_SIZE) are faster than variable-length binary + * keys. It is therefore recommended to always set the key size and record size, + * although it is not required. + * + * Internally, hamsterdb uses two different layouts ("default" and "pax") + * depending on the settings specified by the user. The "default" layout + * is enabled for variable-length keys or if duplicate keys are enabled.
+ * For fixed-length keys (without duplicates) the "pax" layout is chosen. + * The "pax" layout is more compact and usually faster. + * + * A word of warning regarding the use of fixed length binary keys + * (@ref HAM_TYPE_CUSTOM or @ref HAM_TYPE_BINARY in combination with + * @ref HAM_PARAM_KEY_SIZE): if your key size is too large, only few keys + * will fit in a Btree node. The Btree fanout will be very low, which will + * decrease performance. In such cases it might be better to NOT specify + * the key size; then hamsterdb will store keys as blobs if they are too large. + * + * See the Wiki documentation for <a href= + "https://github.com/cruppstahl/hamsterdb/wiki/Evaluating-and-Benchmarking"> + * Evaluating and Benchmarking</a> on how to test different configurations and + * optimize for performance. + * + * The key type is set with @ref HAM_PARAM_KEY_TYPE and can have one + * of the following values: + * + * <ul> + * <li>HAM_TYPE_BINARY</li> This is the default key type: a binary blob. + * Internally, hamsterdb uses memcmp(3) for the sort order. Key size depends + * on @ref HAM_PARAM_KEY_SIZE and is unlimited (@ref HAM_KEY_SIZE_UNLIMITED) + * by default. + * <li>HAM_TYPE_CUSTOM</li> Similar to @ref HAM_TYPE_BINARY, but + * uses a callback function for the sort order. This function is supplied + * by the application with @ref ham_db_set_compare_func. + * <li>HAM_TYPE_UINT8</li> Key is an 8bit (1 byte) unsigned integer + * <li>HAM_TYPE_UINT16</li> Key is a 16bit (2 byte) unsigned integer + * <li>HAM_TYPE_UINT32</li> Key is a 32bit (4 byte) unsigned integer + * <li>HAM_TYPE_UINT64</li> Key is a 64bit (8 byte) unsigned integer + * <li>HAM_TYPE_REAL32</li> Key is a 32bit (4 byte) float + * <li>HAM_TYPE_REAL64</li> Key is a 64bit (8 byte) double + * </ul> + * + * If the key type is omitted then @ref HAM_TYPE_BINARY is the default.
+ * + * If binary/custom keys are so big that they cannot be stored in the Btree, + * then the full key will be stored in an overflow area, which has + * performance implications when accessing such keys. + * + * In addition to the flags above, you can specify @a HAM_ENABLE_DUPLICATE_KEYS + * to insert duplicate keys, i.e. to model 1:n or n:m relationships. + * + * If the size of the records is always constant, then + * @ref HAM_PARAM_RECORD_SIZE should be used to specify this size. This allows + * hamsterdb to optimize the record storage, and small records will + * automatically be stored in the Btree's leaf nodes instead of a separately + * allocated blob, allowing faster access. + * A record size of 0 is valid and suited for boolean values ("key exists" + * vs "key doesn't exist"). The default record size is + * @ref HAM_RECORD_SIZE_UNLIMITED. + * + * @param env A valid Environment handle. + * @param db A valid Database handle, which will point to the created + * Database. To close the handle, use @ref ham_db_close. + * @param name The name of the Database. If a Database with this name + * already exists, the function will fail with + * @ref HAM_DATABASE_ALREADY_EXISTS. Database names from 0xf000 to + * 0xffff and 0 are reserved. + * @param flags Optional flags for creating the Database, combined with + * bitwise OR. Possible flags are: + * <ul> + * <li>@ref HAM_ENABLE_DUPLICATE_KEYS </li> Enable duplicate keys for this + * Database. By default, duplicate keys are disabled. + * <li>@ref HAM_RECORD_NUMBER32 </li> Creates an "auto-increment" Database. + * Keys in Record Number Databases are automatically assigned an + * incrementing 32bit value. If key->data is not NULL + * (and key->flags is @ref HAM_KEY_USER_ALLOC), the value of the current + * key is returned in @a key. If key->data is NULL and key->size is 0, + * key->data is temporarily allocated by hamsterdb. + * <li>@ref HAM_RECORD_NUMBER64 </li> Creates an "auto-increment" Database.
+ * Keys in Record Number Databases are automatically assigned an + * incrementing 64bit value. If key->data is not NULL + * (and key->flags is @ref HAM_KEY_USER_ALLOC), the value of the current + * key is returned in @a key. If key->data is NULL and key->size is 0, + * key->data is temporarily allocated by hamsterdb. + * </ul> + * + * @param params An array of ham_parameter_t structures. The following + * parameters are available: + * <ul> + * <li>@ref HAM_PARAM_KEY_TYPE </li> The type of the keys in the B+Tree + * index. The default is @ref HAM_TYPE_BINARY. See above for more + * information. + * <li>@ref HAM_PARAM_KEY_SIZE </li> The (fixed) size of the keys in + * the B+Tree index; or @ref HAM_KEY_SIZE_UNLIMITED for unlimited and + * variable keys (this is the default). + * <li>@ref HAM_PARAM_RECORD_SIZE </li> The (fixed) size of the records; + * or @ref HAM_RECORD_SIZE_UNLIMITED if there was no fixed record size + * specified (this is the default). + * </ul> + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if the @a env pointer is NULL or an + * invalid combination of flags was specified + * @return @ref HAM_DATABASE_ALREADY_EXISTS if a Database with this @a name + * already exists in this Environment + * @return @ref HAM_OUT_OF_MEMORY if memory could not be allocated + * @return @ref HAM_LIMITS_REACHED if the maximum number of Databases per + * Environment was already created + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_env_create_db(ham_env_t *env, ham_db_t **db, + uint16_t name, uint32_t flags, const ham_parameter_t *params); + +/** + * Opens a Database in a Database Environment + * + * Each Database in an Environment is identified by a positive 16bit + * value (except 0 and values at or above 0xf000). + * + * This function initializes the ham_db_t handle (the second parameter). + * When the handle is no longer in use, it should be closed with + * @ref ham_db_close.
Alternatively, the Database handle is closed + * automatically if @ref ham_env_close is called with the flag + * @ref HAM_AUTO_CLEANUP. + * + * @param env A valid Environment handle + * @param db A valid Database handle, which will point to the opened + * Database. To close the handle, use @see ham_db_close. + * @param name The name of the Database. If a Database with this name + * does not exist, the function will fail with + * @ref HAM_DATABASE_NOT_FOUND. + * @param flags Optional flags for opening the Database, combined with + * bitwise OR. Possible flags are: + * <ul> + * <li>@ref HAM_READ_ONLY </li> Opens the Database for reading only. + * Operations that need write access (i.e. @ref ham_db_insert) will + * return @ref HAM_WRITE_PROTECTED. + * </ul> + * @param params Reserved; set to NULL + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if the @a env pointer is NULL or an + * invalid combination of flags was specified + * @return @ref HAM_DATABASE_NOT_FOUND if a Database with this @a name + * does not exist in this Environment. + * @return @ref HAM_DATABASE_ALREADY_OPEN if this Database was already + * opened + * @return @ref HAM_OUT_OF_MEMORY if memory could not be allocated + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_env_open_db(ham_env_t *env, ham_db_t **db, + uint16_t name, uint32_t flags, const ham_parameter_t *params); + +/** + * Renames a Database in an Environment. + * + * @param env A valid Environment handle. + * @param oldname The old name of the existing Database. If a Database + * with this name does not exist, the function will fail with + * @ref HAM_DATABASE_NOT_FOUND. + * @param newname The new name of this Database. If a Database + * with this name already exists, the function will fail with + * @ref HAM_DATABASE_ALREADY_EXISTS. + * @param flags Optional flags for renaming the Database, combined with + * bitwise OR; unused, set to 0. 
+ * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if the @a env pointer is NULL or if + * the new Database name is reserved + * @return @ref HAM_DATABASE_NOT_FOUND if a Database with this @a name + * does not exist in this Environment + * @return @ref HAM_DATABASE_ALREADY_EXISTS if a Database with the new name + * already exists + * @return @ref HAM_OUT_OF_MEMORY if memory could not be allocated + * @return @ref HAM_NOT_READY if the Environment @a env was not initialized + * correctly (i.e. not yet opened or created) + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_env_rename_db(ham_env_t *env, uint16_t oldname, + uint16_t newname, uint32_t flags); + +/** + * Deletes a Database from an Environment + * + * @param env A valid Environment handle + * @param name The name of the Database to delete. If a Database + * with this name does not exist, the function will fail with + * @ref HAM_DATABASE_NOT_FOUND. If the Database was already opened, + * the function will fail with @ref HAM_DATABASE_ALREADY_OPEN. + * @param flags Optional flags for deleting the Database; unused, set to 0. + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if the @a env pointer is NULL or if + * the new Database name is reserved + * @return @ref HAM_DATABASE_NOT_FOUND if a Database with this @a name + * does not exist + * @return @ref HAM_DATABASE_ALREADY_OPEN if a Database with this name is + * still open + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_env_erase_db(ham_env_t *env, uint16_t name, uint32_t flags); + +/* internal flag - only flush committed transactions, not the btree pages */ +#define HAM_FLUSH_COMMITTED_TRANSACTIONS 1 + +/** + * Flushes the Environment + * + * This function flushes the Environment caches and writes the whole file + * to disk. All Databases of this Environment are flushed as well. 
+ * + * Since In-Memory Databases do not have a file on disk, the + * function will have no effect and will return @ref HAM_SUCCESS. + * + * @param env A valid Environment handle + * @param flags Optional flags for flushing; unused, set to 0 + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if @a env is NULL + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_env_flush(ham_env_t *env, uint32_t flags); + +/* internal use only - don't lock mutex */ +#define HAM_DONT_LOCK 0xf0000000 + +/** + * Returns the names of all Databases in an Environment + * + * This function returns the names of all Databases and the number of + * Databases in an Environment. + * + * The memory for @a names must be allocated by the user. @a count + * must be the size of @a names when calling the function, and will be + * the number of Databases when the function returns. The function returns + * @ref HAM_LIMITS_REACHED if @a names is not big enough; in this case, the + * caller should resize the array and call the function again. + * + * @param env A valid Environment handle + * @param names Pointer to an array for the Database names + * @param count Pointer to the size of the array; will be used to store the + * number of Databases when the function returns. + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if @a env, @a names or @a count is NULL + * @return @ref HAM_LIMITS_REACHED if @a names is not large enough to hold + * all Database names + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_env_get_database_names(ham_env_t *env, uint16_t *names, + uint32_t *count); + +/** + * Closes the Database Environment + * + * This function closes the Database Environment. It also frees the + * memory resources allocated in the @a env handle, and tries to truncate + * the file (see below).
+ * + * If the flag @ref HAM_AUTO_CLEANUP is specified, hamsterdb automatically + * calls @ref ham_db_close with flag @ref HAM_AUTO_CLEANUP on all open + * Databases (which closes all open Databases and their Cursors). This + * invalidates the ham_db_t and ham_cursor_t handles! + * + * If the flag is not specified, the application must close all Database + * handles with @ref ham_db_close to prevent memory leaks. + * + * This function also aborts all Transactions which were not yet committed, + * and therefore renders all Transaction handles invalid. If the flag + * @ref HAM_TXN_AUTO_COMMIT is specified, all Transactions will be committed. + * + * This function also tries to truncate the file and "cut off" unused space + * at the end of the file to reduce the file size. This feature is disabled + * on Win32 if memory mapped I/O is used (see @ref HAM_DISABLE_MMAP). + * + * @param env A valid Environment handle + * @param flags Optional flags for closing the handle. Possible flags are: + * <ul> + * <li>@ref HAM_AUTO_CLEANUP. Calls @ref ham_db_close with the flag + * @ref HAM_AUTO_CLEANUP on every open Database + * <li>@ref HAM_TXN_AUTO_COMMIT. Automatically commit all open + * Transactions + * <li>@ref HAM_TXN_AUTO_ABORT. Automatically abort all open + * Transactions; this is the default behaviour + * </ul> + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if @a env is NULL + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_env_close(ham_env_t *env, uint32_t flags); + +/** + * @} + */ + + +/** + * @defgroup ham_txn hamsterdb Transaction Functions + * @{ + */ + +/** + * The hamsterdb Transaction structure + * + * This structure is allocated with @ref ham_txn_begin and deleted with + * @ref ham_txn_commit or @ref ham_txn_abort. + */ +struct ham_txn_t; +typedef struct ham_txn_t ham_txn_t; + +/** + * Begins a new Transaction + * + * A Transaction is an atomic sequence of Database operations. 
With @ref + * ham_txn_begin such a new sequence is started. To write all operations of this + * sequence to the Database use @ref ham_txn_commit. To abort and cancel + * this sequence use @ref ham_txn_abort. + * + * In order to use Transactions, the Environment has to be created or + * opened with the flag @ref HAM_ENABLE_TRANSACTIONS. + * + * You can create as many Transactions as you want (older versions of + * hamsterdb did not allow to create more than one Transaction in parallel). + * + * @param txn Pointer to a pointer of a Transaction structure + * @param env A valid Environment handle + * @param name An optional Transaction name + * @param reserved A reserved pointer; always set to NULL + * @param flags Optional flags for beginning the Transaction, combined with + * bitwise OR. Possible flags are: + * <ul> + * <li>@ref HAM_TXN_READ_ONLY </li> This Transaction is read-only and + * will not modify the Database. + * </ul> + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_OUT_OF_MEMORY if memory allocation failed + */ +HAM_EXPORT ham_status_t +ham_txn_begin(ham_txn_t **txn, ham_env_t *env, const char *name, + void *reserved, uint32_t flags); + +/** Flag for @ref ham_txn_begin */ +#define HAM_TXN_READ_ONLY 1 + +/* Internal flag for @ref ham_txn_begin */ +#define HAM_TXN_TEMPORARY 2 + +/** + * Retrieves the Transaction name + * + * @returns NULL if the name was not assigned or if @a txn is invalid + */ +HAM_EXPORT const char * +ham_txn_get_name(ham_txn_t *txn); + +/** + * Commits a Transaction + * + * This function applies the sequence of Database operations. + * + * Note that the function will fail with @ref HAM_CURSOR_STILL_OPEN if + * a Cursor was attached to this Transaction (with @ref ham_cursor_create + * or @ref ham_cursor_clone), and the Cursor was not closed. + * + * @param txn Pointer to a Transaction structure + * @param flags Optional flags for committing the Transaction, combined with + * bitwise OR. Unused, set to 0. 
+ * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_IO_ERROR if writing to the file failed + * @return @ref HAM_CURSOR_STILL_OPEN if there are Cursors attached to this + * Transaction + */ +HAM_EXPORT ham_status_t +ham_txn_commit(ham_txn_t *txn, uint32_t flags); + +/** + * Aborts a Transaction + * + * This function aborts (= cancels) the sequence of Database operations. + * + * Note that the function will fail with @ref HAM_CURSOR_STILL_OPEN if + * a Cursor was attached to this Transaction (with @ref ham_cursor_create + * or @ref ham_cursor_clone), and the Cursor was not closed. + * + * @param txn Pointer to a Transaction structure + * @param flags Optional flags for aborting the Transaction, combined with + * bitwise OR. Unused, set to 0. + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_IO_ERROR if writing to the Database file or logfile failed + * @return @ref HAM_CURSOR_STILL_OPEN if there are Cursors attached to this + * Transaction + */ +HAM_EXPORT ham_status_t +ham_txn_abort(ham_txn_t *txn, uint32_t flags); + +/** + * @} + */ + + +/** + * @defgroup ham_database hamsterdb Database Functions + * @{ + */ + +/** Flag for @ref ham_env_open, @ref ham_env_create. + * This flag is non persistent. */ +#define HAM_ENABLE_FSYNC 0x00000001 + +/* unused 0x00000002 */ + +/** Flag for @ref ham_env_open, @ref ham_env_open_db. + * This flag is non persistent. */ +#define HAM_READ_ONLY 0x00000004 + +/* unused 0x00000008 */ + +/* unused 0x00000010 */ + +/* reserved 0x00000020 */ + +/* unused 0x00000040 */ + +/** Flag for @ref ham_env_create. + * This flag is non persistent. */ +#define HAM_IN_MEMORY 0x00000080 + +/* reserved: DB_USE_MMAP (not persistent) 0x00000100 */ + +/** Flag for @ref ham_env_open, @ref ham_env_create. + * This flag is non persistent. */ +#define HAM_DISABLE_MMAP 0x00000200 + +/* deprecated */ +#define HAM_RECORD_NUMBER HAM_RECORD_NUMBER64 + +/** Flag for @ref ham_env_create_db. + * This flag is persisted in the Database. 
*/ +#define HAM_RECORD_NUMBER32 0x00001000 + +/** Flag for @ref ham_env_create_db. + * This flag is persisted in the Database. */ +#define HAM_RECORD_NUMBER64 0x00002000 + +/** Flag for @ref ham_env_create_db. + * This flag is persisted in the Database. */ +#define HAM_ENABLE_DUPLICATE_KEYS 0x00004000 +/* deprecated */ +#define HAM_ENABLE_DUPLICATES HAM_ENABLE_DUPLICATE_KEYS + +/** Flag for @ref ham_env_create, @ref ham_env_open. + * This flag is non persistent. */ +#define HAM_ENABLE_RECOVERY 0x00008000 + +/** Flag for @ref ham_env_open. + * This flag is non persistent. */ +#define HAM_AUTO_RECOVERY 0x00010000 + +/** Flag for @ref ham_env_create, @ref ham_env_open. + * This flag is non persistent. */ +#define HAM_ENABLE_TRANSACTIONS 0x00020000 + +/** Flag for @ref ham_env_open, @ref ham_env_create. + * This flag is non persistent. */ +#define HAM_CACHE_UNLIMITED 0x00040000 + +/** Flag for @ref ham_env_create, @ref ham_env_open. + * This flag is non persistent. */ +#define HAM_DISABLE_RECOVERY 0x00080000 + +/* internal use only! (not persistent) */ +#define HAM_IS_REMOTE_INTERNAL 0x00200000 + +/* internal use only! (not persistent) */ +#define HAM_DISABLE_RECLAIM_INTERNAL 0x00400000 + +/* internal use only! (persistent) */ +#define HAM_FORCE_RECORDS_INLINE 0x00800000 + +/** Flag for @ref ham_env_open, @ref ham_env_create. + * This flag is non persistent. */ +#define HAM_FLUSH_WHEN_COMMITTED 0x01000000 + +/** Pro: Flag for @ref ham_env_open, @ref ham_env_create. + * This flag is non persistent. */ +#define HAM_ENABLE_CRC32 0x02000000 + +/** + * Returns the last error code + * + * @note This API is deprecated! It will be removed in one of the + * next versions. + * + * @param db A valid Database handle + * + * @return The last error code which was returned by one of the + * hamsterdb API functions. 
Use @ref ham_strerror to translate + * this code to a descriptive string + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_db_get_error(ham_db_t *db); + +/** + * Typedef for a key comparison function + * + * @remark This function compares two index keys. It returns -1, if @a lhs + * ("left-hand side", the parameter on the left side) is smaller than + * @a rhs ("right-hand side"), 0 if both keys are equal, and 1 if @a lhs + * is larger than @a rhs. + */ +typedef int HAM_CALLCONV (*ham_compare_func_t)(ham_db_t *db, + const uint8_t *lhs, uint32_t lhs_length, + const uint8_t *rhs, uint32_t rhs_length); + +/** + * Sets the comparison function + * + * The comparison function compares two index keys. It returns -1 if the + * first key is smaller, +1 if the second key is smaller or 0 if both + * keys are equal. + * + * Supplying a comparison function is only allowed for the key type + * @ref HAM_TYPE_CUSTOM; see the documentation of @sa ham_env_create_db + * for more information. + * + * @param db A valid Database handle + * @param foo A pointer to the compare function + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if one of the parameters is NULL + * @return @ref HAM_INV_PARAMETER if the database's key type was not + * specified as @ref HAM_TYPE_CUSTOM + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_db_set_compare_func(ham_db_t *db, ham_compare_func_t foo); + +/** + * Searches an item in the Database + * + * This function searches the Database for @a key. If the key + * is found, @a record will receive the record of this item and + * @ref HAM_SUCCESS is returned. If the key is not found, the function + * returns @ref HAM_KEY_NOT_FOUND. + * + * A ham_record_t structure should be initialized with + * zeroes before it is being used. This can be done with the C library + * routines memset(3) or bzero(2). 
+ * + * If the function completes successfully, the @a record pointer is + * initialized with the size of the record (in @a record.size) and the + * actual record data (in @a record.data). If the record is empty, + * @a size is 0 and @a data points to NULL. + * + * The @a data pointer is a temporary pointer and will be overwritten + * by subsequent hamsterdb API calls using the same Transaction + * (or, if Transactions are disabled, using the same Database). + * You can alter this behaviour by allocating the @a data pointer in + * the application and setting @a record.flags to @ref HAM_RECORD_USER_ALLOC. + * Make sure that the allocated buffer is large enough. + * + * When specifying @ref HAM_DIRECT_ACCESS, the @a data pointer will point + * directly to the record that is stored in hamsterdb; the data can be modified, + * but the pointer must not be reallocated or freed. The flag @ref + * HAM_DIRECT_ACCESS is only allowed in In-Memory Databases and not if + * Transactions are enabled. + * + * @ref ham_db_find can not search for duplicate keys. If @a key has + * multiple duplicates, only the first duplicate is returned. + * + * You can read only portions of the record by specifying the flag + * @ref HAM_PARTIAL. In this case, hamsterdb will read + * <b>record->partial_size</b> bytes of the record data at offset + * <b>record->partial_offset</b>. If necessary, the record data will + * be limited to the original record size. The number of actually read + * bytes is returned in <b>record->partial_size</b>. The original size of + * the record is stored in <b>record->size</b>. + * + * @ref HAM_PARTIAL is not allowed if record->size is <= 8 or if Transactions + * are enabled. In such a case, @ref HAM_INV_PARAMETER is returned. + * + * If Transactions are enabled (see @ref HAM_ENABLE_TRANSACTIONS) and + * @a txn is NULL then hamsterdb will create a temporary Transaction. 
+ * When moving the Cursor, and the new key is currently modified in an + * active Transaction (one that is not yet committed or aborted) then + * hamsterdb will skip this key and move to the next/previous one. However if + * @a flags are 0 (and the Cursor is not moved), and @a key or @a rec + * is NOT NULL, then hamsterdb will return error @ref HAM_TXN_CONFLICT. + * + * @param db A valid Database handle + * @param txn A Transaction handle, or NULL + * @param key The key of the item + * @param record The record of the item + * @param flags Optional flags for searching, which can be combined with + * bitwise OR. Possible flags are: + * <ul> + * <li>@ref HAM_FIND_LT_MATCH </li> Cursor 'find' flag 'Less Than': the + * cursor is moved to point at the last record which' key + * is less than the specified key. When such a record cannot + * be located, an error is returned. + * <li>@ref HAM_FIND_GT_MATCH </li> Cursor 'find' flag 'Greater Than': + * the cursor is moved to point at the first record which' key is + * larger than the specified key. When such a record cannot be + * located, an error is returned. + * <li>@ref HAM_FIND_LEQ_MATCH </li> Cursor 'find' flag 'Less or EQual': + * the cursor is moved to point at the record which' key matches + * the specified key and when such a record is not available + * the cursor is moved to point at the last record which' key + * is less than the specified key. When such a record cannot be + * located, an error is returned. + * <li>@ref HAM_FIND_GEQ_MATCH </li> Cursor 'find' flag 'Greater or + * Equal': the cursor is moved to point at the record which' key + * matches the specified key and when such a record + * is not available the cursor is moved to point at the first + * record which' key is larger than the specified key. + * When such a record cannot be located, an error is returned. 
+ * <li>@ref HAM_FIND_NEAR_MATCH </li> Cursor 'find' flag 'Any Near Or + * Equal': the cursor is moved to point at the record which' + * key matches the specified key and when such a record is + * not available the cursor is moved to point at either the + * last record which' key is less than the specified key or + * the first record which' key is larger than the specified + * key, whichever of these records is located first. + * When such records cannot be located, an error is returned. + * <li>@ref HAM_DIRECT_ACCESS </li> Only for In-Memory Databases + * and not if Transactions are enabled! + * Returns a direct pointer to the data blob stored by the + * hamsterdb engine. This pointer must not be resized or freed, + * but the data in this memory can be modified. + * </ul> + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if @a db, @a key or @a record is NULL + * @return @ref HAM_INV_PARAMETER if @a HAM_DIRECT_ACCESS is specified, + * but the Database is not an In-Memory Database. + * @return @ref HAM_INV_PARAMETER if @a HAM_DIRECT_ACCESS and + * @a HAM_ENABLE_TRANSACTIONS were both specified. + * @return @ref HAM_INV_PARAMETER if @ref HAM_PARTIAL is set but record + * size is <= 8 or Transactions are enabled + * @return @ref HAM_KEY_NOT_FOUND if the @a key does not exist + * @return @ref HAM_TXN_CONFLICT if the same key was inserted in another + * Transaction which was not yet committed or aborted + * + * @remark When either or both @ref HAM_FIND_LT_MATCH and/or @ref + * HAM_FIND_GT_MATCH have been specified as flags, the @a key structure + * will be overwritten when an approximate match was found: the + * @a key and @a record structures will then point at the located + * @a key and @a record. In this case the caller should ensure @a key + * points at a structure which must adhere to the same restrictions + * and conditions as specified for @ref ham_cursor_move(..., + * HAM_CURSOR_NEXT). 
+ * + * @sa HAM_RECORD_USER_ALLOC + * @sa HAM_KEY_USER_ALLOC + * @sa ham_record_t + * @sa ham_key_t + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_db_find(ham_db_t *db, ham_txn_t *txn, ham_key_t *key, + ham_record_t *record, uint32_t flags); + +/** + * Inserts a Database item + * + * This function inserts a key/record pair as a new Database item. + * + * If the key already exists in the Database, error @ref HAM_DUPLICATE_KEY + * is returned. + * + * If you wish to overwrite an existing entry specify the + * flag @ref HAM_OVERWRITE. + * + * You can write only portions of the record by specifying the flag + * @ref HAM_PARTIAL. In this case, hamsterdb will write <b>partial_size</b> + * bytes of the record data at offset <b>partial_offset</b>. The full record + * size will always be given in <b>record->size</b>! If + * partial_size+partial_offset exceed record->size then partial_size will + * be limited. To shrink or grow the record, adjust record->size. + * @ref HAM_PARTIAL automatically overwrites existing records. + * Gaps will be filled with null-bytes if the record did not yet exist. + * + * @ref HAM_PARTIAL is not allowed if record->size is <= 8 or if Transactions + * are enabled. In such a case, @ref HAM_INV_PARAMETER is returned. + * + * If you wish to insert a duplicate key specify the flag @ref HAM_DUPLICATE. + * (Note that the Database has to be created with @ref HAM_ENABLE_DUPLICATE_KEYS + * in order to use duplicate keys.) + * The duplicate key is inserted after all other duplicate keys (see + * @ref HAM_DUPLICATE_INSERT_LAST). + * + * Record Number Databases (created with @ref HAM_RECORD_NUMBER32 or + * @ref HAM_RECORD_NUMBER64) expect either an empty @a key (with a size of + * 0 and data pointing to NULL), or a user-supplied key (with key.flag + * @ref HAM_KEY_USER_ALLOC and a valid data pointer). 
+ * If key.size is 0 and key.data is NULL, hamsterdb will temporarily + * allocate memory for key->data, which will then point to a 4-byte (or 8-byte) + * unsigned integer. + * + * For very fast sequential inserts please use @ref ham_cursor_insert in + * combination with the flag @ref HAM_HINT_APPEND. + * + * @param db A valid Database handle + * @param txn A Transaction handle, or NULL + * @param key The key of the new item + * @param record The record of the new item + * @param flags Optional flags for inserting. Possible flags are: + * <ul> + * <li>@ref HAM_OVERWRITE. If the @a key already exists, the record is + * overwritten. Otherwise, the key is inserted. Flag is not + * allowed in combination with @ref HAM_DUPLICATE. + * <li>@ref HAM_DUPLICATE. If the @a key already exists, a duplicate + * key is inserted. The key is inserted before the already + * existing key, or according to the sort order. Flag is not + * allowed in combination with @ref HAM_OVERWRITE. + * </ul> + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if @a db, @a key or @a record is NULL + * @return @ref HAM_INV_PARAMETER if the Database is a Record Number Database + * and the key is invalid (see above) + * @return @ref HAM_INV_PARAMETER if @ref HAM_PARTIAL is set but record + * size is <= 8 or Transactions are enabled + * @return @ref HAM_INV_PARAMETER if the flags @ref HAM_OVERWRITE <b>and</b> + * @ref HAM_DUPLICATE were specified, or if @ref HAM_DUPLICATE + * was specified, but the Database was not created with + * flag @ref HAM_ENABLE_DUPLICATE_KEYS. 
+ * @return @ref HAM_INV_PARAMETER if @ref HAM_PARTIAL is specified and + * record->partial_offset+record->partial_size exceeds the + * record->size + * @return @ref HAM_WRITE_PROTECTED if you tried to insert a key in a read-only + * Database + * @return @ref HAM_TXN_CONFLICT if the same key was inserted in another + * Transaction which was not yet committed or aborted + * @return @ref HAM_INV_KEY_SIZE if the key size is larger than the + * @ref HAM_PARAM_KEY_SIZE parameter specified for + * @ref ham_env_create_db + * OR if the key's size is greater than the Btree key size (see + * @ref HAM_PARAM_KEY_SIZE). + * @return @ref HAM_INV_RECORD_SIZE if the record size is different from + * the one specified with @a HAM_PARAM_RECORD_SIZE + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_db_insert(ham_db_t *db, ham_txn_t *txn, ham_key_t *key, + ham_record_t *record, uint32_t flags); + +/** + * Flag for @ref ham_db_insert and @ref ham_cursor_insert + * + * When specified with @ref ham_db_insert and in case a key + * is specified which stores duplicates in the Database, the first + * duplicate record will be overwritten. + * + * When used with @ref ham_cursor_insert and assuming the same + * conditions, the duplicate currently referenced by the Cursor + * will be overwritten. 
+*/ +#define HAM_OVERWRITE 0x0001 + +/** Flag for @ref ham_db_insert and @ref ham_cursor_insert */ +#define HAM_DUPLICATE 0x0002 + +/** Flag for @ref ham_cursor_insert */ +#define HAM_DUPLICATE_INSERT_BEFORE 0x0004 + +/** Flag for @ref ham_cursor_insert */ +#define HAM_DUPLICATE_INSERT_AFTER 0x0008 + +/** Flag for @ref ham_cursor_insert */ +#define HAM_DUPLICATE_INSERT_FIRST 0x0010 + +/** Flag for @ref ham_cursor_insert */ +#define HAM_DUPLICATE_INSERT_LAST 0x0020 + +/** Flag for @ref ham_db_find, @ref ham_cursor_find, @ref ham_cursor_move */ +#define HAM_DIRECT_ACCESS 0x0040 + +/** Flag for @ref ham_db_insert, @ref ham_cursor_insert, @ref ham_db_find, + * @ref ham_cursor_find, @ref ham_cursor_move */ +#define HAM_PARTIAL 0x0080 + +/* Internal flag for @ref ham_db_find, @ref ham_cursor_find, + * @ref ham_cursor_move */ +#define HAM_FORCE_DEEP_COPY 0x0100 + +/** + * Flag for @ref ham_cursor_insert + * + * Mutually exclusive with flag @ref HAM_HINT_PREPEND. + * + * Hints the hamsterdb engine that the current key will + * compare as @e larger than any key already existing in the Database. + * The hamsterdb engine will verify this postulation and when found not + * to be true, will revert to a regular insert operation + * as if this flag was not specified. The incurred cost then is only one + * additional key comparison. + */ +#define HAM_HINT_APPEND 0x00080000 + +/** + * Flag for @ref ham_cursor_insert + * + * Mutually exclusive with flag @ref HAM_HINT_APPEND. + * + * Hints the hamsterdb engine that the current key will + * compare as @e smaller than any key already existing in the Database. + * The hamsterdb engine will verify this postulation and when found not + * to be true, will revert to a regular insert operation + * as if this flag was not specified. The incurred cost then is only one + * additional key comparison. + */ +#define HAM_HINT_PREPEND 0x00100000 + +/** + * Flag mask to extract the common hint flags from a find/move/insert/erase + * flag value. 
+ */ +#define HAM_HINTS_MASK 0x001F0000 + +/** + * Erases a Database item + * + * This function erases a Database item. If the item @a key + * does not exist, @ref HAM_KEY_NOT_FOUND is returned. + * + * Note that ham_db_erase can not erase a single duplicate key. If the key + * has multiple duplicates, all duplicates of this key will be erased. Use + * @ref ham_cursor_erase to erase a specific duplicate key. + * + * @param db A valid Database handle + * @param txn A Transaction handle, or NULL + * @param key The key to delete + * @param flags Optional flags for erasing; unused, set to 0 + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if @a db or @a key is NULL + * @return @ref HAM_WRITE_PROTECTED if you tried to erase a key from a read-only + * Database + * @return @ref HAM_KEY_NOT_FOUND if @a key was not found + * @return @ref HAM_TXN_CONFLICT if the same key was inserted in another + * Transaction which was not yet committed or aborted + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_db_erase(ham_db_t *db, ham_txn_t *txn, ham_key_t *key, uint32_t flags); + +/* internal flag for ham_db_erase() - do not use */ +#define HAM_ERASE_ALL_DUPLICATES 1 + +/** + * Returns the number of keys stored in the Database + * + * You can specify the flag @ref HAM_SKIP_DUPLICATES if you do not want + * to include any duplicates in the count. This will also speed up the + * counting. + * + * @param db A valid Database handle + * @param txn A Transaction handle, or NULL + * @param flags Optional flags: + * <ul> + * <li>@ref HAM_SKIP_DUPLICATES. 
Excludes any duplicates from + * the count + * </ul> + * @param keycount A reference to a variable which will receive + * the calculated key count per page + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if @a db or @a keycount is NULL or when + * @a flags contains an invalid flag set + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_db_get_key_count(ham_db_t *db, ham_txn_t *txn, uint32_t flags, + uint64_t *keycount); + +/** + * Retrieve the current value for a given Database setting + * + * Only those values requested by the parameter array will be stored. + * + * The following parameters are supported: + * <ul> + * <li>HAM_PARAM_FLAGS</li> returns the flags which were used to + * open or create this Database + * <li>HAM_PARAM_DATABASE_NAME</li> returns the Database name + * <li>HAM_PARAM_KEY_TYPE</li> returns the Btree key type + * <li>HAM_PARAM_KEY_SIZE</li> returns the Btree key size + * or @ref HAM_KEY_SIZE_UNLIMITED if there was no fixed key size + * specified. + * <li>HAM_PARAM_RECORD_SIZE</li> returns the record size, + * or @ref HAM_RECORD_SIZE_UNLIMITED if there was no fixed record size + * specified. + * <li>HAM_PARAM_MAX_KEYS_PER_PAGE</li> returns the maximum number + * of keys per page. This number is precise if the key size is fixed + * and duplicates are disabled; otherwise it's an estimate. 
+ * <li>@ref HAM_PARAM_RECORD_COMPRESSION</li> Returns the + * selected algorithm for record compression, or 0 if compression + * is disabled + * <li>@ref HAM_PARAM_KEY_COMPRESSION</li> Returns the + * selected algorithm for key compression, or 0 if compression + * is disabled + * </ul> + * + * @param db A valid Database handle + * @param param An array of ham_parameter_t structures + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if the @a db pointer is NULL or + * @a param is NULL + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_db_get_parameters(ham_db_t *db, ham_parameter_t *param); + +/** Parameter name for @ref ham_env_open, @ref ham_env_create; + * Journal files are switched whenever the number of new Transactions exceeds + * this threshold. */ +#define HAM_PARAM_JOURNAL_SWITCH_THRESHOLD 0x00001 + +/** Parameter name for @ref ham_env_open, @ref ham_env_create; + * sets the cache size */ +#define HAM_PARAM_CACHE_SIZE 0x00000100 +/* deprecated */ +#define HAM_PARAM_CACHESIZE HAM_PARAM_CACHE_SIZE + +/** Parameter name for @ref ham_env_create; sets the page size */ +#define HAM_PARAM_PAGE_SIZE 0x00000101 +/* deprecated */ +#define HAM_PARAM_PAGESIZE HAM_PARAM_PAGE_SIZE + +/** Parameter name for @ref ham_env_create_db; sets the key size */ +#define HAM_PARAM_KEY_SIZE 0x00000102 +/* deprecated */ +#define HAM_PARAM_KEYSIZE HAM_PARAM_KEY_SIZE + +/** Parameter name for @ref ham_env_get_parameters; retrieves the number + * of maximum Databases */ +#define HAM_PARAM_MAX_DATABASES 0x00000103 + +/** Parameter name for @ref ham_env_create_db; sets the key type */ +#define HAM_PARAM_KEY_TYPE 0x00000104 + +/** Parameter name for @ref ham_env_open, @ref ham_env_create; + * sets the path of the log files */ +#define HAM_PARAM_LOG_DIRECTORY 0x00000105 + +/** hamsterdb pro: Parameter name for @ref ham_env_open, @ref ham_env_create; + * sets the AES encryption key */ +#define HAM_PARAM_ENCRYPTION_KEY 0x00000106 + +/** Parameter name for @ref 
ham_env_open, @ref ham_env_create; + * sets the network timeout (in seconds) */ +#define HAM_PARAM_NETWORK_TIMEOUT_SEC 0x00000107 + +/** Parameter name for @ref ham_env_create_db; sets the record size */ +#define HAM_PARAM_RECORD_SIZE 0x00000108 + +/** Parameter name for @ref ham_env_create, @ref ham_env_open; sets a + * limit for the file size (in bytes) */ +#define HAM_PARAM_FILE_SIZE_LIMIT 0x00000109 + +/** Parameter name for @ref ham_env_create, @ref ham_env_open; sets the + * parameter for posix_fadvise() */ +#define HAM_PARAM_POSIX_FADVISE 0x00000110 + +/** Value for @ref HAM_PARAM_POSIX_FADVISE */ +#define HAM_POSIX_FADVICE_NORMAL 0 + +/** Value for @ref HAM_PARAM_POSIX_FADVISE */ +#define HAM_POSIX_FADVICE_RANDOM 1 + +/** Value for unlimited record sizes */ +#define HAM_RECORD_SIZE_UNLIMITED ((uint32_t)-1) + +/** Value for unlimited key sizes */ +#define HAM_KEY_SIZE_UNLIMITED ((uint16_t)-1) + +/** Retrieves the Database/Environment flags as were specified at the time of + * @ref ham_env_create/@ref ham_env_open invocation. */ +#define HAM_PARAM_FLAGS 0x00000200 + +/** Retrieves the filesystem file access mode as was specified at the time + * of @ref ham_env_create/@ref ham_env_open invocation. */ +#define HAM_PARAM_FILEMODE 0x00000201 + +/** + * Return a <code>const char *</code> pointer to the current + * Environment/Database file name in the @ref uint64_t value + * member, when the Database is actually stored on disc. + * + * In-memory Databases will return a NULL (0) pointer instead. + */ +#define HAM_PARAM_FILENAME 0x00000202 + +/** + * Retrieve the Database 'name' number of this @ref ham_db_t Database within + * the current @ref ham_env_t Environment. +*/ +#define HAM_PARAM_DATABASE_NAME 0x00000203 + +/** + * Retrieve the maximum number of keys per page; this number depends on the + * currently active page and key sizes. Can be an estimate if keys do not + * have constant sizes or if duplicate keys are used. 
+ */ +#define HAM_PARAM_MAX_KEYS_PER_PAGE 0x00000204 + +/** + * hamsterdb pro: Parameter name for @ref ham_env_create, @ref ham_env_open; + * enables compression for the journal. + */ +#define HAM_PARAM_JOURNAL_COMPRESSION 0x00001000 + +/** + * hamsterdb pro: Parameter name for @ref ham_env_create_db, + * @ref ham_env_open_db; enables compression for the records of + * a Database. + */ +#define HAM_PARAM_RECORD_COMPRESSION 0x00001001 + +/** + * hamsterdb pro: Parameter name for @ref ham_env_create_db, + * @ref ham_env_open_db; enables compression for the keys of + * a Database. + */ +#define HAM_PARAM_KEY_COMPRESSION 0x00001002 + +/** hamsterdb pro: helper macro for disabling compression */ +#define HAM_COMPRESSOR_NONE 0 + +/** + * hamsterdb pro: selects zlib compression + * http://www.zlib.net/ + */ +#define HAM_COMPRESSOR_ZLIB 1 + +/** + * hamsterdb pro: selects google snappy compression + * http://code.google.com/p/snappy + */ +#define HAM_COMPRESSOR_SNAPPY 2 + +/** + * hamsterdb pro: selects lzf compression + * http://oldhome.schmorp.de/marc/liblzf.html + */ +#define HAM_COMPRESSOR_LZF 3 + +/** + * hamsterdb pro: selects lzo compression + * http://www.oberhumer.com/opensource/lzo + */ +#define HAM_COMPRESSOR_LZO 4 + +/** + * Retrieves the Environment handle of a Database + * + * @param db A valid Database handle + * + * @return The Environment handle + */ +HAM_EXPORT ham_env_t *HAM_CALLCONV +ham_db_get_env(ham_db_t *db); + +/** + * Returns the kind of key match which produced this key as it was + * returned by one of the @ref ham_db_find() and @ref ham_cursor_find(). + * + * This routine assumes the key was passed back by one of the @ref ham_db_find + * and @ref ham_cursor_find functions and not used by any other hamsterdb + * functions after that. + * + * As such, this function produces an answer akin to the 'sign' of the + * specified key as it was returned by the find operation. 
+ * + * @param key A valid key + * + * @return 1 (greater than) or -1 (less than) when the given key is an + * approximate result / zero (0) otherwise. Specifically: + * <ul> + * <li>+1 when the key is greater than the item searched for (key + * was a GT match) + * <li>-1 when the key is less than the item searched for (key was + * a LT match) + * <li>zero (0) otherwise (key was an EQ (EXACT) match) + * </ul> + */ +HAM_EXPORT int HAM_CALLCONV +ham_key_get_approximate_match_type(ham_key_t *key); + +/** + * Closes the Database + * + * This function flushes the Database and then closes the file handle. + * It also frees the memory resources allocated in the @a db handle. + * + * If the flag @ref HAM_AUTO_CLEANUP is specified, hamsterdb automatically + * calls @ref ham_cursor_close on all open Cursors. This invalidates the + * ham_cursor_t handle! + * + * If the flag is not specified, the application must close all Database + * Cursors with @ref ham_cursor_close to prevent memory leaks. + * + * This function also aborts all Transactions which were not yet committed, + * and therefore renders all Transaction handles invalid. If the flag + * @ref HAM_TXN_AUTO_COMMIT is specified, all Transactions will be committed. + * + * @param db A valid Database handle + * @param flags Optional flags for closing the Database. Possible values are: + * <ul> + * <li>@ref HAM_AUTO_CLEANUP. Automatically closes all open Cursors + * <li>@ref HAM_TXN_AUTO_COMMIT. Automatically commit all open + * Transactions + * <li>@ref HAM_TXN_AUTO_ABORT. 
Automatically abort all open + * Transactions; this is the default behaviour + * </ul> + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if @a db is NULL + * @return @ref HAM_CURSOR_STILL_OPEN if not all Cursors of this Database + * were closed, and @ref HAM_AUTO_CLEANUP was not specified + * @return @ref HAM_TXN_STILL_OPEN if this Database is modified by a + * currently active Transaction + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_db_close(ham_db_t *db, uint32_t flags); + +/** Flag for @ref ham_db_close, @ref ham_env_close */ +#define HAM_AUTO_CLEANUP 1 + +/** @internal (Internal) flag for @ref ham_db_close, @ref ham_env_close */ +#define HAM_DONT_CLEAR_LOG 2 + +/** Automatically abort all open Transactions (the default) */ +#define HAM_TXN_AUTO_ABORT 4 + +/** Automatically commit all open Transactions */ +#define HAM_TXN_AUTO_COMMIT 8 + +/** + * @} + */ + +/** + * @defgroup ham_cursor hamsterdb Cursor Functions + * @{ + */ + +/** + * Creates a Database Cursor + * + * Creates a new Database Cursor. Cursors can be used to + * traverse the Database from start to end or vice versa. Cursors + * can also be used to insert, delete or search Database items. + * + * A newly created Cursor does not point to any item in the Database. + * + * The application should close all Cursors of a Database before closing + * the Database. + * + * If Transactions are enabled (@ref HAM_ENABLE_TRANSACTIONS), but @a txn + * is NULL, then each Cursor operation (i.e. @ref ham_cursor_insert, + * @ref ham_cursor_find etc) will create its own, temporary Transaction + * <b>only</b> for the lifetime of this operation and not for the lifetime + * of the whole Cursor! 
+ * + * @param db A valid Database handle + * @param txn A Transaction handle, or NULL + * @param flags Optional flags for creating the Cursor; unused, set to 0 + * @param cursor A pointer to a pointer which is allocated for the + * new Cursor handle + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if @a db or @a cursor is NULL + * @return @ref HAM_OUT_OF_MEMORY if the new structure could not be allocated + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_cursor_create(ham_cursor_t **cursor, ham_db_t *db, ham_txn_t *txn, + uint32_t flags); + +/** + * Clones a Database Cursor + * + * Clones an existing Cursor. The new Cursor will point to + * exactly the same item as the old Cursor. If the old Cursor did not point + * to any item, so will the new Cursor. + * + * If the old Cursor is bound to a Transaction, then the new Cursor will + * also be bound to this Transaction. + * + * @param src The existing Cursor + * @param dest A pointer to a pointer, which is allocated for the + * cloned Cursor handle + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if @a src or @a dest is NULL + * @return @ref HAM_OUT_OF_MEMORY if the new structure could not be allocated + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_cursor_clone(ham_cursor_t *src, ham_cursor_t **dest); + +/** + * Moves the Cursor + * + * Moves the Cursor. Use the @a flags to specify the direction. + * After the move, key and record of the item are returned, if @a key + * and/or @a record are valid pointers. + * + * If the direction is not specified, the Cursor will not move. Do not + * specify a direction if you want to fetch the key and/or record of + * the current item. + * + * When specifying @ref HAM_DIRECT_ACCESS, the @a data pointer will point + * directly to the record that is stored in hamsterdb; the data can be modified, + * but the pointer must not be reallocated or freed. 
The flag @ref + * HAM_DIRECT_ACCESS is only allowed in In-Memory Databases and not if + * Transactions are enabled. + * + * You can read only portions of the record by specifying the flag + * @ref HAM_PARTIAL. In this case, hamsterdb will read + * <b>record->partial_size</b> bytes of the record data at offset + * <b>record->partial_offset</b>. If necessary, the record data will + * be limited to the original record size. The number of actually read + * bytes is returned in <b>record->partial_size</b>. The original size of + * the record is stored in <b>record->size</b>. + * + * @ref HAM_PARTIAL is not allowed if record->size is <= 8 or if Transactions + * are enabled. In such a case, @ref HAM_INV_PARAMETER is returned. + * + * If Transactions are enabled (see @ref HAM_ENABLE_TRANSACTIONS), and + * the Cursor moves next or previous to a key which is currently modified + * in an active Transaction (one that is not yet committed or aborted), then + * hamsterdb will skip the modified key. (This behavior is different from i.e. + * @a ham_cursor_find, which would return the error @ref HAM_TXN_CONFLICT). + * + * If a key has duplicates and any of the duplicates is currently modified + * in another active Transaction, then ALL duplicate keys are skipped when + * moving to the next or previous key. + * + * If the first (@ref HAM_CURSOR_FIRST) or last (@ref HAM_CURSOR_LAST) key + * is requested, and the current key (or any of its duplicates) is currently + * modified in an active Transaction, then @ref HAM_TXN_CONFLICT is + * returned. + * + * If this Cursor is nil (i.e. because it was not yet used or the Cursor's + * item was erased) then the flag @a HAM_CURSOR_NEXT (or @a + * HAM_CURSOR_PREVIOUS) will be identical to @a HAM_CURSOR_FIRST (or + * @a HAM_CURSOR_LAST). + * + * @param cursor A valid Cursor handle + * @param key An optional pointer to a @ref ham_key_t structure. If this + * pointer is not NULL, the key of the new item is returned. 
+ * Note that key->data will point to temporary data. This pointer + * will be invalidated by subsequent hamsterdb API calls. See + * @ref HAM_KEY_USER_ALLOC on how to change this behaviour. + * @param record An optional pointer to a @ref ham_record_t structure. If this + * pointer is not NULL, the record of the new item is returned. + * Note that record->data will point to temporary data. This pointer + * will be invalidated by subsequent hamsterdb API calls. See + * @ref HAM_RECORD_USER_ALLOC on how to change this behaviour. + * @param flags The flags for this operation. They are used to specify + * the direction for the "move". If you do not specify a direction, + * the Cursor will remain on the current position. + * <ul> + * <li>@ref HAM_CURSOR_FIRST </li> positions the Cursor on the first + * item in the Database + * <li>@ref HAM_CURSOR_LAST </li> positions the Cursor on the last + * item in the Database + * <li>@ref HAM_CURSOR_NEXT </li> positions the Cursor on the next + * item in the Database; if the Cursor does not point to any + * item, the function behaves as if direction was + * @ref HAM_CURSOR_FIRST. + * <li>@ref HAM_CURSOR_PREVIOUS </li> positions the Cursor on the + * previous item in the Database; if the Cursor does not point to + * any item, the function behaves as if direction was + * @ref HAM_CURSOR_LAST. + * <li>@ref HAM_SKIP_DUPLICATES </li> skips duplicate keys of the + * current key. Not allowed in combination with + * @ref HAM_ONLY_DUPLICATES. + * <li>@ref HAM_ONLY_DUPLICATES </li> only move through duplicate keys + * of the current key. Not allowed in combination with + * @ref HAM_SKIP_DUPLICATES. + * <li>@ref HAM_DIRECT_ACCESS </li> Only for In-Memory Databases and + * not if Transactions are enabled! + * Returns a direct pointer to the data blob stored by the + * hamsterdb engine. This pointer must not be resized or freed, + * but the data in this memory can be modified. 
+ * </ul> + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if @a cursor is NULL, or if an invalid + * combination of flags was specified + * @return @ref HAM_INV_PARAMETER if @ref HAM_PARTIAL is set but record + * size is <= 8 or Transactions are enabled + * @return @ref HAM_CURSOR_IS_NIL if the Cursor does not point to an item, but + * key and/or record were requested + * @return @ref HAM_KEY_NOT_FOUND if @a cursor points to the first (or last) + * item, and a move to the previous (or next) item was + * requested + * @return @ref HAM_INV_PARAMETER if @a HAM_DIRECT_ACCESS is specified, + * but the Database is not an In-Memory Database. + * @return @ref HAM_INV_PARAMETER if @a HAM_DIRECT_ACCESS and + * @a HAM_ENABLE_TRANSACTIONS were both specified. + * @return @ref HAM_INV_PARAMETER if @ref HAM_PARTIAL is specified and + * record->partial_offset+record->partial_size exceeds the + * record->size + * @return @ref HAM_TXN_CONFLICT if @ref HAM_CURSOR_FIRST or @ref + * HAM_CURSOR_LAST is specified but the first (or last) key or + * any of its duplicates is currently modified in an active + * Transaction + * + * @sa HAM_RECORD_USER_ALLOC + * @sa HAM_KEY_USER_ALLOC + * @sa ham_record_t + * @sa ham_key_t + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_cursor_move(ham_cursor_t *cursor, ham_key_t *key, + ham_record_t *record, uint32_t flags); + +/** Flag for @ref ham_cursor_move */ +#define HAM_CURSOR_FIRST 0x0001 + +/** Flag for @ref ham_cursor_move */ +#define HAM_CURSOR_LAST 0x0002 + +/** Flag for @ref ham_cursor_move */ +#define HAM_CURSOR_NEXT 0x0004 + +/** Flag for @ref ham_cursor_move */ +#define HAM_CURSOR_PREVIOUS 0x0008 + +/** Flag for @ref ham_cursor_move and @ref ham_db_get_key_count */ +#define HAM_SKIP_DUPLICATES 0x0010 + +/** Flag for @ref ham_cursor_move */ +#define HAM_ONLY_DUPLICATES 0x0020 + +/** + * Overwrites the current record + * + * This function overwrites the record of the current item. 
+ * + * @param cursor A valid Cursor handle + * @param record A valid record structure + * @param flags Optional flags for overwriting the item; unused, set to 0 + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if @a cursor or @a record is NULL + * @return @ref HAM_INV_PARAMETER if @a cursor points to an item with + * duplicates and duplicate sorting is enabled + * @return @ref HAM_INV_PARAMETER if duplicate sorting is enabled + * @return @ref HAM_CURSOR_IS_NIL if the Cursor does not point to an item + * @return @ref HAM_TXN_CONFLICT if the same key was inserted in another + * Transaction which was not yet committed or aborted + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_cursor_overwrite(ham_cursor_t *cursor, ham_record_t *record, + uint32_t flags); + +/** + * Searches with a key and points the Cursor to the key found, retrieves + * the located record + * + * Searches for an item in the Database and points the Cursor to this item. + * If the item could not be found, the Cursor is not modified. + * + * Note that @ref ham_cursor_find can not search for duplicate keys. If @a key + * has multiple duplicates, only the first duplicate is returned. + * + * When specifying @ref HAM_DIRECT_ACCESS, the @a data pointer will point + * directly to the record that is stored in hamsterdb; the data can be modified, + * but the pointer must not be reallocated or freed. The flag @ref + * HAM_DIRECT_ACCESS is only allowed in In-Memory Databases and not if + * Transactions are enabled. + * + * You can read only portions of the record by specifying the flag + * @ref HAM_PARTIAL. In this case, hamsterdb will read + * <b>record->partial_size</b> bytes of the record data at offset + * <b>record->partial_offset</b>. If necessary, the record data will + * be limited to the original record size. The number of actually read + * bytes is returned in <b>record->partial_size</b>. The original size of + * the record is stored in <b>record->size</b>. 
+ * + * @ref HAM_PARTIAL is not allowed if record->size is <= 8 or if Transactions + * are enabled. In such a case, @ref HAM_INV_PARAMETER is returned. + * + * When either or both @ref HAM_FIND_LT_MATCH and/or @ref HAM_FIND_GT_MATCH + * have been specified as flags, the @a key structure will be overwritten + * when an approximate match was found: the @a key and @a record + * structures will then point at the located @a key (and @a record). + * In this case the caller should ensure @a key points at a structure + * which must adhere to the same restrictions and conditions as specified + * for @ref ham_cursor_move(...,HAM_CURSOR_*): + * key->data will point to temporary data upon return. This pointer + * will be invalidated by subsequent hamsterdb API calls using the same + * Transaction (or the same Database, if Transactions are disabled). See + * @ref HAM_KEY_USER_ALLOC on how to change this behaviour. + * + * Further note that the @a key structure must be non-const at all times as its + * internal flag bits may be written to. This is done for your benefit, as + * you may pass the returned @a key structure to + * @ref ham_key_get_approximate_match_type() to retrieve additional info about + * the precise nature of the returned key: the sign value produced + * by @ref ham_key_get_approximate_match_type() tells you which kind of match + * (equal, less than, greater than) occurred. This is very useful to + * discern between the various possible successful answers produced by the + * combinations of @ref HAM_FIND_LT_MATCH and @ref HAM_FIND_GT_MATCH. + * + * @param cursor A valid Cursor handle + * @param key A pointer to a @ref ham_key_t structure. If this + * pointer is not NULL, the key of the new item is returned. + * Note that key->data will point to temporary data. This pointer + * will be invalidated by subsequent hamsterdb API calls. See + * @a HAM_KEY_USER_ALLOC on how to change this behaviour. + * @param record Optional pointer to a @ref ham_record_t structure. 
If this + * pointer is not NULL, the record of the new item is returned. + * Note that record->data will point to temporary data. This pointer + * will be invalidated by subsequent hamsterdb API calls. See + * @ref HAM_RECORD_USER_ALLOC on how to change this behaviour. + * @param flags Optional flags for searching, which can be combined with + * bitwise OR. Possible flags are: + * <ul> + * <li>@ref HAM_FIND_LT_MATCH </li> Cursor 'find' flag 'Less Than': the + * cursor is moved to point at the last record which' key + * is less than the specified key. When such a record cannot + * be located, an error is returned. + * <li>@ref HAM_FIND_GT_MATCH </li> Cursor 'find' flag 'Greater Than': + * the cursor is moved to point at the first record which' key is + * larger than the specified key. When such a record cannot be + * located, an error is returned. + * <li>@ref HAM_FIND_LEQ_MATCH </li> Cursor 'find' flag 'Less or EQual': + * the cursor is moved to point at the record which' key matches + * the specified key and when such a record is not available + * the cursor is moved to point at the last record which' key + * is less than the specified key. When such a record cannot be + * located, an error is returned. + * <li>@ref HAM_FIND_GEQ_MATCH </li> Cursor 'find' flag 'Greater or + * Equal': the cursor is moved to point at the record which' key + * matches the specified key and when such a record + * is not available the cursor is moved to point at the first + * record which' key is larger than the specified key. + * When such a record cannot be located, an error is returned. 
+ * <li>@ref HAM_FIND_NEAR_MATCH </li> Cursor 'find' flag 'Any Near Or + * Equal': the cursor is moved to point at the record which' + * key matches the specified key and when such a record is + * not available the cursor is moved to point at either the + * last record which' key is less than the specified key or + * the first record which' key is larger than the specified + * key, whichever of these records is located first. + * When such records cannot be located, an error is returned. + * <li>@ref HAM_DIRECT_ACCESS </li> Only for In-Memory Databases and + * not if Transactions are enabled! + * Returns a direct pointer to the data blob stored by the + * hamsterdb engine. This pointer must not be resized or freed, + * but the data in this memory can be modified. + * </ul> + * + * <b>Remark</b> + * For Approximate Matching the returned match will either match the + * key exactly or is either the first key available above or below the + * given key when an exact match could not be found; 'find' does NOT + * spend any effort, in the sense of determining which of both is the + * 'nearest' to the given key, when both a key above and a key below the + * one given exist; 'find' will simply return the first of both found. + * As such, this flag is the simplest possible combination of the + * combined @ref HAM_FIND_LEQ_MATCH and @ref HAM_FIND_GEQ_MATCH flags. + * + * Note that these flags may be bitwise OR-ed to form functional combinations. + * + * @ref HAM_FIND_LEQ_MATCH, @ref HAM_FIND_GEQ_MATCH and + * @ref HAM_FIND_LT_MATCH, @ref HAM_FIND_GT_MATCH + * + * @return @ref HAM_SUCCESS upon success. Mind the remarks about the + * @a key flags being adjusted and the useful invocation of + * @ref ham_key_get_approximate_match_type() afterwards. 
+ * @return @ref HAM_INV_PARAMETER if @a cursor, @a key or @a record is NULL + * @return @ref HAM_CURSOR_IS_NIL if the Cursor does not point to an item + * @return @ref HAM_KEY_NOT_FOUND if no suitable @a key (record) exists + * @return @ref HAM_INV_PARAMETER if @a HAM_DIRECT_ACCESS is specified, + * but the Database is not an In-Memory Database. + * @return @ref HAM_INV_PARAMETER if @a HAM_DIRECT_ACCESS and + * @a HAM_ENABLE_TRANSACTIONS were both specified. + * @return @ref HAM_INV_PARAMETER if @ref HAM_PARTIAL is set but record + * size is <= 8 or Transactions are enabled + * @return @ref HAM_TXN_CONFLICT if the same key was inserted in another + * Transaction which was not yet committed or aborted + * + * @sa HAM_KEY_USER_ALLOC + * @sa ham_key_t + * @sa HAM_RECORD_USER_ALLOC + * @sa ham_record_t + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_cursor_find(ham_cursor_t *cursor, ham_key_t *key, + ham_record_t *record, uint32_t flags); + +/* internal flag */ +#define HAM_FIND_EXACT_MATCH 0x4000 + +/** + * Cursor 'find' flag 'Less Than': return the nearest match below the + * given key, whether an exact match exists or not. + */ +#define HAM_FIND_LT_MATCH 0x1000 + +/** + * Cursor 'find' flag 'Greater Than': return the nearest match above the + * given key, whether an exact match exists or not. + */ +#define HAM_FIND_GT_MATCH 0x2000 + +/** + * Cursor 'find' flag 'Less or EQual': return the nearest match below the + * given key, when an exact match does not exist. + * + * May be combined with @ref HAM_FIND_GEQ_MATCH to accept any 'near' key, or + * you can use the @ref HAM_FIND_NEAR_MATCH constant as a shorthand for that. + */ +#define HAM_FIND_LEQ_MATCH (HAM_FIND_LT_MATCH | HAM_FIND_EXACT_MATCH) + +/** + * Cursor 'find' flag 'Greater or Equal': return the nearest match above + * the given key, when an exact match does not exist. 
+ * + * May be combined with @ref HAM_FIND_LEQ_MATCH to accept any 'near' key, + * or you can use the @ref HAM_FIND_NEAR_MATCH constant as a shorthand for that. + */ +#define HAM_FIND_GEQ_MATCH (HAM_FIND_GT_MATCH | HAM_FIND_EXACT_MATCH) + +/** + * Cursor 'find' flag 'Any Near Or Equal': return a match directly below or + * above the given key, when an exact match does not exist. + * + * Be aware that the returned match will either match the key exactly or + * is either the first key available above or below the given key when an + * exact match could not be found; 'find' does NOT spend any effort, in the + * sense of determining which of both is the 'nearest' to the given key, + * when both a key above and a key below the one given exist; 'find' will + * simply return the first of both found. As such, this flag is the simplest + * possible combination of the combined @ref HAM_FIND_LEQ_MATCH and + * @ref HAM_FIND_GEQ_MATCH flags. + */ +#define HAM_FIND_NEAR_MATCH (HAM_FIND_LT_MATCH | HAM_FIND_GT_MATCH \ + | HAM_FIND_EXACT_MATCH) + +/** + * Inserts a Database item and points the Cursor to the inserted item + * + * This function inserts a key/record pair as a new Database item. + * If the key already exists in the Database, error @ref HAM_DUPLICATE_KEY + * is returned. + * + * If you wish to overwrite an existing entry specify the + * flag @ref HAM_OVERWRITE. The use of this flag is not allowed in combination + * with @ref HAM_DUPLICATE. + * + * If you wish to insert a duplicate key specify the flag @ref HAM_DUPLICATE. + * (In order to use duplicate keys, the Database has to be created with + * @ref HAM_ENABLE_DUPLICATE_KEYS.) + * By default, the duplicate key is inserted after all other duplicate keys + * (see @ref HAM_DUPLICATE_INSERT_LAST). This behaviour can be overwritten by + * specifying @ref HAM_DUPLICATE_INSERT_FIRST, @ref HAM_DUPLICATE_INSERT_BEFORE + * or @ref HAM_DUPLICATE_INSERT_AFTER. 
+ * + * You can write only portions of the record by specifying the flag + * @ref HAM_PARTIAL. In this case, hamsterdb will write <b>partial_size</b> + * bytes of the record data at offset <b>partial_offset</b>. If necessary, the + * record data will grow. Gaps will be filled with null-bytes, if the record + * did not yet exist. + * + * @ref HAM_PARTIAL is not allowed if record->size is <= 8 or if Transactions + * are enabled. In such a case, @ref HAM_INV_PARAMETER is returned. + * + * Specify the flag @ref HAM_HINT_APPEND if you insert sequential data + * and the current @a key is greater than any other key in this Database. + * In this case hamsterdb will optimize the insert algorithm. hamsterdb will + * verify that this key is the greatest; if not, it will perform a normal + * insert. This flag is the default for Record Number Databases. + * + * Specify the flag @ref HAM_HINT_PREPEND if you insert sequential data + * and the current @a key is lower than any other key in this Database. + * In this case hamsterdb will optimize the insert algorithm. hamsterdb will + * verify that this key is the lowest; if not, it will perform a normal + * insert. + * + * After inserting, the Cursor will point to the new item. If inserting + * the item failed, the Cursor is not modified. + * + * Record Number Databases (created with @ref HAM_RECORD_NUMBER32 or + * @ref HAM_RECORD_NUMBER64) expect either an empty @a key (with a size of + * 0 and data pointing to NULL), or a user-supplied key (with key.flag + * @ref HAM_KEY_USER_ALLOC and a valid data pointer). + * If key.size is 0 and key.data is NULL, hamsterdb will temporarily + * allocate memory for key->data, which will then point to an 4-byte (or 8-byte) + * unsigned integer. + * + * @param cursor A valid Cursor handle + * @param key A valid key structure + * @param record A valid record structure + * @param flags Optional flags for inserting the item, combined with + * bitwise OR. 
Possible flags are: + * <ul> + * <li>@ref HAM_OVERWRITE. If the @a key already exists, the record is + * overwritten. Otherwise, the key is inserted. Not allowed in + * combination with @ref HAM_DUPLICATE. + * <li>@ref HAM_DUPLICATE. If the @a key already exists, a duplicate + * key is inserted. Same as @ref HAM_DUPLICATE_INSERT_LAST. Not + * allowed in combination with @ref HAM_OVERWRITE. + * <li>@ref HAM_DUPLICATE_INSERT_BEFORE. If the @a key already exists, + * a duplicate key is inserted before the duplicate pointed + * to by the Cursor. Not allowed if duplicate sorting is enabled. + * <li>@ref HAM_DUPLICATE_INSERT_AFTER. If the @a key already exists, + * a duplicate key is inserted after the duplicate pointed + * to by the Cursor. Not allowed if duplicate sorting is enabled. + * <li>@ref HAM_DUPLICATE_INSERT_FIRST. If the @a key already exists, + * a duplicate key is inserted as the first duplicate of + * the current key. Not allowed if duplicate sorting is enabled. + * <li>@ref HAM_DUPLICATE_INSERT_LAST. If the @a key already exists, + * a duplicate key is inserted as the last duplicate of + * the current key. Not allowed if duplicate sorting is enabled. + * <li>@ref HAM_HINT_APPEND. Hints the hamsterdb engine that the + * current key will compare as @e larger than any key already + * existing in the Database. The hamsterdb engine will verify + * this postulation and when found not to be true, will revert + * to a regular insert operation as if this flag was not + * specified. The incurred cost then is only one additional key + * comparison. Mutually exclusive with flag @ref HAM_HINT_PREPEND. + * This is the default for Record Number Databases. + * <li>@ref HAM_HINT_PREPEND. Hints the hamsterdb engine that the + * current key will compare as @e lower than any key already + * existing in the Database. 
The hamsterdb engine will verify + * this postulation and when found not to be true, will revert + * to a regular insert operation as if this flag was not + * specified. The incurred cost then is only one additional key + * comparison. Mutually exclusive with flag @ref HAM_HINT_APPEND. + * </ul> + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if @a key or @a record is NULL + * @return @ref HAM_INV_PARAMETER if the Database is a Record Number Database + * and the key is invalid (see above) + * @return @ref HAM_INV_PARAMETER if @ref HAM_PARTIAL is set but record + * size is <= 8 or Transactions are enabled + * @return @ref HAM_INV_PARAMETER if the flags @ref HAM_OVERWRITE <b>and</b> + * @ref HAM_DUPLICATE were specified, or if @ref HAM_DUPLICATE + * was specified, but the Database was not created with + * flag @ref HAM_ENABLE_DUPLICATE_KEYS. + * @return @ref HAM_WRITE_PROTECTED if you tried to insert a key to a read-only + * Database. + * @return @ref HAM_INV_KEY_SIZE if the key size is different from + * the one specified with @a HAM_PARAM_KEY_SIZE + * @return @ref HAM_INV_RECORD_SIZE if the record size is different from + * the one specified with @a HAM_PARAM_RECORD_SIZE + * @return @ref HAM_CURSOR_IS_NIL if the Cursor does not point to an item + * @return @ref HAM_TXN_CONFLICT if the same key was inserted in another + * Transaction which was not yet committed or aborted + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_cursor_insert(ham_cursor_t *cursor, ham_key_t *key, + ham_record_t *record, uint32_t flags); + +/** + * Erases the current key + * + * Erases a key from the Database. If the erase was + * successful, the Cursor is invalidated and does no longer point to + * any item. In case of an error, the Cursor is not modified. + * + * If the Database was opened with the flag @ref HAM_ENABLE_DUPLICATE_KEYS, + * this function erases only the duplicate item to which the Cursor refers. 
+ * + * @param cursor A valid Cursor handle + * @param flags Unused, set to 0 + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if @a cursor is NULL + * @return @ref HAM_WRITE_PROTECTED if you tried to erase a key from a read-only + * Database + * @return @ref HAM_CURSOR_IS_NIL if the Cursor does not point to an item + * @return @ref HAM_TXN_CONFLICT if the same key was inserted in another + * Transaction which was not yet committed or aborted + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_cursor_erase(ham_cursor_t *cursor, uint32_t flags); + +/** + * Returns the number of duplicate keys + * + * Returns the number of duplicate keys of the item to which the + * Cursor currently refers. + * Returns 1 if the key has no duplicates. + * + * @param cursor A valid Cursor handle + * @param count Returns the number of duplicate keys + * @param flags Optional flags; unused, set to 0. + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_CURSOR_IS_NIL if the Cursor does not point to an item + * @return @ref HAM_INV_PARAMETER if @a cursor or @a count is NULL + * @return @ref HAM_TXN_CONFLICT if the same key was inserted in another + * Transaction which was not yet committed or aborted + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_cursor_get_duplicate_count(ham_cursor_t *cursor, + uint32_t *count, uint32_t flags); + +/** + * Returns the current cursor position in the duplicate list + * + * Returns the position in the duplicate list of the current key. The position + * is 0-based. 
+ * + * @param cursor A valid Cursor handle + * @param position Returns the duplicate position + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_CURSOR_IS_NIL if the Cursor does not point to an item + * @return @ref HAM_INV_PARAMETER if @a cursor or @a position is NULL + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_cursor_get_duplicate_position(ham_cursor_t *cursor, + uint32_t *position); + +/** + * Returns the record size of the current key + * + * Returns the record size of the item to which the Cursor currently refers. + * + * @param cursor A valid Cursor handle + * @param size Returns the record size, in bytes + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_CURSOR_IS_NIL if the Cursor does not point to an item + * @return @ref HAM_INV_PARAMETER if @a cursor or @a size is NULL + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_cursor_get_record_size(ham_cursor_t *cursor, uint64_t *size); + +/** + * Closes a Database Cursor + * + * Closes a Cursor and frees allocated memory. All Cursors + * should be closed before closing the Database (see @ref ham_db_close). + * + * @param cursor A valid Cursor handle + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_CURSOR_IS_NIL if the Cursor does not point to an item + * @return @ref HAM_INV_PARAMETER if @a cursor is NULL + * + * @sa ham_db_close + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_cursor_close(ham_cursor_t *cursor); + +/** + * @} + */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif /* HAM_HAMSTERDB_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/include/ham/hamsterdb.hpp b/plugins/Dbx_kv/src/hamsterdb/include/ham/hamsterdb.hpp new file mode 100644 index 0000000000..68892ac2d7 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/include/ham/hamsterdb.hpp @@ -0,0 +1,711 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file hamsterdb.hpp + * @author Christoph Rupp, chris@crupp.de + * @version 2.1.10 + * + * This C++ wrapper class is a very tight wrapper around the C API. It does + * not attempt to be STL compatible. + * + * All functions throw exceptions of class @sa ham::error in case of an error. + * Please refer to the C API documentation for more information. You can find + * it here: http://hamsterdb.com/?page=doxygen&module=globals.html + * + */ + +#ifndef HAM_HAMSTERDB_HPP +#define HAM_HAMSTERDB_HPP + +#include <ham/hamsterdb.h> +#include <ham/hamsterdb_int.h> +#include <cstring> +#include <vector> + +#if defined(_MSC_VER) && defined(_DEBUG) && !defined(_CRTDBG_MAP_ALLOC) +# define _CRTDBG_MAP_ALLOC +# include <crtdbg.h> +#endif + +/** + * @defgroup ham_cpp hamsterdb C++ API wrapper + * @{ + */ + +/** + * The global hamsterdb namespace. + */ +namespace hamsterdb { + +class txn; +class db; +class env; + +/** + * An error class. + * + * The hamsterdb C++ API throws this class as Exceptions. + */ +class error { + public: + /** Constructor */ + error(ham_status_t st) + : m_errno(st) { + }; + + /** Returns the error code. */ + ham_status_t get_errno() const { + return (m_errno); + } + + /** Returns an English error description. */ + const char *get_string() const { + return (ham_strerror(m_errno)); + } + +private: + ham_status_t m_errno; +}; + +/** + * A key class. 
+ * + * This class wraps structures of type ham_key_t. + */ +class key { + public: + /** Constructor */ + key(void *data = 0, uint16_t size = 0, uint32_t flags = 0) { + memset(&m_key, 0, sizeof(m_key)); + m_key.data = data; + m_key.size = size; + m_key.flags = flags; + if (m_key.size != size) // check for overflow + throw error(HAM_INV_KEYSIZE); + } + + /** Copy constructor. */ + key(const key &other) + : m_key(other.m_key) { + } + + /** Assignment operator. */ + key &operator=(const key &other) { + if (&other != this) + m_key = other.m_key; + return (*this); + } + + /** Returns the key data. */ + void *get_data() const { + return (m_key.data); + } + + /** Sets the key data. */ + void set_data(void *data) { + m_key.data = data; + } + + /** Returns the size of the key. */ + uint16_t get_size() const { + return (m_key.size); + } + + /** Sets the size of the key. */ + void set_size(uint16_t size) { + m_key.size = size; + } + + /** Template assignment */ + template <class T> + void set(T &t) { + set_data(&t); + set_size(sizeof(t)); + } + + /** Returns the flags of the key. */ + uint32_t get_flags() const { + return (m_key.flags); + } + + /** Sets the flags of the key. */ + void set_flags(uint32_t flags) { + m_key.flags = flags; + } + + /** Returns a pointer to the internal ham_key_t structure. */ + ham_key_t *get_handle() { + return (&m_key); + } + + /** Returns 'sign' of Approximate Match */ + int get_approximate_match_type() { + return (ham_key_get_approximate_match_type(&m_key)); + } + +private: + ham_key_t m_key; +}; + +/** + * A record class. + * + * This class wraps structures of type ham_record_t. + */ +class record { + public: + /** Constructor */ + record(void *data = 0, uint32_t size = 0, uint32_t flags = 0) { + memset(&m_rec, 0, sizeof(m_rec)); + m_rec.data = data; + m_rec.size = size; + m_rec.flags = flags; + } + + /** Copy constructor. */ + record(const record &other) + : m_rec(other.m_rec) { + } + + /** Assignment operator. 
*/ + record &operator=(const record &other) { + m_rec = other.m_rec; + return (*this); + } + + /** Returns the record data. */ + void *get_data() const { + return (m_rec.data); + } + + /** Sets the record data. */ + void set_data(void *data) { + m_rec.data = data; + } + + /** Returns the size of the record. */ + uint32_t get_size() const { + return (m_rec.size); + } + + /** Sets the size of the record. */ + void set_size(uint32_t size) { + m_rec.size = size; + } + + /** Returns the flags of the record. */ + uint32_t get_flags() const { + return (m_rec.flags); + } + + /** Sets the flags of the record. */ + void set_flags(uint32_t flags) { + m_rec.flags = flags; + } + + /** Returns a pointer to the internal ham_record_t structure. */ + ham_record_t *get_handle() { + return (&m_rec); + } + + protected: + ham_record_t m_rec; +}; + + +/** + * A Transaction class + * + * This class wraps structures of type ham_txn_t. + */ +class txn { + public: + /** Constructor */ + txn(ham_txn_t *t = 0) + : m_txn(t) { + } + + /** Abort the Transaction */ + void abort() { + ham_status_t st = ham_txn_abort(m_txn, 0); + if (st) + throw error(st); + } + + /** Commit the Transaction */ + void commit() { + ham_status_t st = ham_txn_commit(m_txn, 0); + if (st) + throw error(st); + } + + std::string get_name() { + const char *p = ham_txn_get_name(m_txn); + return (p ? p : ""); + } + + /** Returns a pointer to the internal ham_txn_t structure. */ + ham_txn_t *get_handle() { + return (m_txn); + } + + protected: + ham_txn_t *m_txn; +}; + + +/** + * A Database class. + * + * This class wraps the ham_db_t Database handles. + */ +class db { + public: + /** Set error handler function. */ + static void set_errhandler(ham_errhandler_fun f) { + ham_set_errhandler(f); + } + + /** Retrieves the hamsterdb library version. 
*/ + static void get_version(uint32_t *major, uint32_t *minor, + uint32_t *revision) { + ham_get_version(major, minor, revision); + } + + /** Constructor */ + db() + : m_db(0) { + } + + /** Destructor - automatically closes the Database, if necessary. */ + ~db() { + close(); + } + + /** + * Assignment operator. + * + * <b>Important!</b> This operator transfers the ownership of the + * Database handle. + */ + db &operator=(const db &other) { + db &rhs = (db &)other; + if (this == &other) + return (*this); + close(); + m_db = rhs.m_db; + rhs.m_db = 0; + return (*this); + } + + /** Returns the last Database error. */ + ham_status_t get_error() { + return (ham_db_get_error(m_db)); + } + + /** Sets the comparison function. */ + void set_compare_func(ham_compare_func_t foo) { + ham_status_t st = ham_db_set_compare_func(m_db, foo); + if (st) + throw error(st); + } + + /** Finds a record by looking up the key. */ + record find(txn *t, key *k, uint32_t flags = 0) { + record r; + ham_status_t st = ham_db_find(m_db, + t ? t->get_handle() : 0, + k ? k->get_handle() : 0, + r.get_handle(), flags); + if (st) + throw error(st); + return (r); + } + + /** Finds a record by looking up the key. */ + record &find(txn *t, key *k, record *r, uint32_t flags = 0) { + ham_status_t st = ham_db_find(m_db, + t ? t->get_handle() : 0, + k ? k->get_handle() : 0, + r->get_handle(), flags); + if (st) + throw error(st); + return (*r); + } + + /** Finds a record by looking up the key. */ + record find(key *k, uint32_t flags = 0) { + return (find(0, k, flags)); + } + + /** Inserts a key/record pair. */ + void insert(txn *t, key *k, record *r, uint32_t flags = 0) { + ham_status_t st = ham_db_insert(m_db, + t ? t->get_handle() : 0, + k ? k->get_handle() : 0, + r ? r->get_handle() : 0, flags); + if (st) + throw error(st); + } + + /** Inserts a key/record pair. */ + void insert(key *k, record *r, uint32_t flags=0) { + insert(0, k, r, flags); + } + + /** Erases a key/record pair. 
*/ + void erase(key *k, uint32_t flags = 0) { + erase(0, k, flags); + } + + /** Erases a key/record pair. */ + void erase(txn *t, key *k, uint32_t flags = 0) { + ham_status_t st = ham_db_erase(m_db, + t ? t->get_handle() : 0, + k ? k->get_handle() : 0, flags); + if (st) + throw error(st); + } + + /** Returns number of items in the Database. */ + uint64_t get_key_count(ham_txn_t *txn = 0, uint32_t flags = 0) { + uint64_t count = 0; + ham_status_t st = ham_db_get_key_count(m_db, txn, flags, &count); + if (st) + throw error(st); + return (count); + } + + /** Retrieves Database parameters. */ + void get_parameters(ham_parameter_t *param) { + ham_status_t st = ham_db_get_parameters(m_db, param); + if (st) + throw error(st); + } + + /** Closes the Database. */ + void close(uint32_t flags = 0) { + if (!m_db) + return; + // disable auto-cleanup; all objects will be destroyed when + // going out of scope + flags &= ~HAM_AUTO_CLEANUP; + ham_status_t st = ham_db_close(m_db, flags); + if (st) + throw error(st); + m_db = 0; + } + + /** Returns a pointer to the internal ham_db_t structure. */ + ham_db_t *get_handle() { + return (m_db); + } + +protected: + friend class env; + + /* Constructs a db from a raw ham_db_t handle (this is not a copy constructor). Is protected and should not be used directly; friend class env creates db objects through it. */ + db(ham_db_t *db) + : m_db(db) { + } + + private: + ham_db_t *m_db; +}; + + +/** + * A Database Cursor. + * + * This class wraps the ham_cursor_t Cursor handles. + */ +class cursor { + public: + /** Constructor */ + cursor(db *db = 0, txn *t = 0, uint32_t flags = 0) + : m_cursor(0) { + create(db, t, flags); + } + + /** Constructor */ + cursor(txn *t, db *db = 0, uint32_t flags = 0) + : m_cursor(0) { + create(db, t, flags); + } + + /** Destructor - automatically closes the Cursor, if necessary. */ + ~cursor() { + close(); + } + + /** Creates a new Cursor. */ + void create(db *db, txn *t = 0, uint32_t flags = 0) { + if (m_cursor) + close(); + if (db) { + ham_status_t st = ham_cursor_create(&m_cursor, db->get_handle(), + t ?
t->get_handle() : 0, flags); + if (st) + throw error(st); + } + } + + /** Clones the Cursor. */ + cursor clone() { + ham_cursor_t *dest; + ham_status_t st = ham_cursor_clone(m_cursor, &dest); + if (st) + throw error(st); + return (cursor(dest)); + } + + /** Moves the Cursor, and retrieves the key/record of the new position. */ + void move(key *k, record *r, uint32_t flags = 0) { + ham_status_t st = ham_cursor_move(m_cursor, k ? k->get_handle() : 0, + r ? r->get_handle() : 0, flags); + if (st) + throw error(st); + } + + /** Moves the Cursor to the first Database element. */ + void move_first(key *k = 0, record *r = 0) { + move(k, r, HAM_CURSOR_FIRST); + } + + /** Moves the Cursor to the last Database element. */ + void move_last(key *k = 0, record *r = 0) { + move(k, r, HAM_CURSOR_LAST); + } + + /** Moves the Cursor to the next Database element. */ + void move_next(key *k = 0, record *r = 0) { + move(k, r, HAM_CURSOR_NEXT); + } + + /** Moves the Cursor to the previous Database element. */ + void move_previous(key *k = 0, record *r = 0) { + move(k, r, HAM_CURSOR_PREVIOUS); + } + + /** Overwrites the current record. */ + void overwrite(record *r, uint32_t flags = 0) { + ham_status_t st = ham_cursor_overwrite(m_cursor, + r ? r->get_handle() : 0, flags); + if (st) + throw error(st); + } + + /** Finds a key. */ + void find(key *k, record *r = 0, uint32_t flags = 0) { + ham_status_t st = ham_cursor_find(m_cursor, k->get_handle(), + (r ? r->get_handle() : 0), flags); + if (st) + throw error(st); + } + + /** Inserts a key/record pair. */ + void insert(key *k, record *r, uint32_t flags = 0) { + ham_status_t st = ham_cursor_insert(m_cursor, k ? k->get_handle() : 0, + r ? r->get_handle() : 0, flags); + if (st) + throw error(st); + } + + /** Erases the current key/record pair. */ + void erase(uint32_t flags = 0) { + ham_status_t st = ham_cursor_erase(m_cursor, flags); + if (st) + throw error(st); + } + + /** Returns the number of duplicate keys. 
*/ + uint32_t get_duplicate_count(uint32_t flags = 0) { + uint32_t c; + ham_status_t st = ham_cursor_get_duplicate_count(m_cursor, &c, flags); + if (st) + throw error(st); + return (c); + } + + /** Returns the size of the current record. */ + uint64_t get_record_size() { + uint64_t s; + ham_status_t st = ham_cursor_get_record_size(m_cursor, &s); + if (st) + throw error(st); + return (s); + } + + /** Closes the Cursor. */ + void close() { + if (!m_cursor) + return; + ham_status_t st = ham_cursor_close(m_cursor); + if (st) + throw error(st); + m_cursor = 0; + } + + protected: + /* Constructs a cursor from a raw ham_cursor_t handle (this is not a copy constructor). Is protected and should not be used directly; clone() wraps its result through it. */ + cursor(ham_cursor_t *c) { + m_cursor = c; + } + + private: + ham_cursor_t *m_cursor; +}; + +/** + * An Environment class. + * + * This class wraps the ham_env_t structure. + */ +class env { + public: + /** Constructor */ + env() + : m_env(0) { + } + + /** Destructor - automatically closes the Environment, if necessary. */ + ~env() { + close(); + } + + /** Creates a new Environment. */ + void create(const char *filename, uint32_t flags = 0, + uint32_t mode = 0644, const ham_parameter_t *param = 0) { + ham_status_t st = ham_env_create(&m_env, filename, flags, mode, param); + if (st) + throw error(st); + } + + /** Opens an existing Environment. */ + void open(const char *filename, uint32_t flags = 0, + const ham_parameter_t *param = 0) { + ham_status_t st = ham_env_open(&m_env, filename, flags, param); + if (st) + throw error(st); + } + + /** Flushes the Environment to disk. */ + void flush(uint32_t flags = 0) { + ham_status_t st = ham_env_flush(m_env, flags); + if (st) + throw error(st); + } + + /** Creates a new Database in the Environment. */ + db create_db(uint16_t name, uint32_t flags = 0, + const ham_parameter_t *param = 0) { + ham_db_t *dbh; + + ham_status_t st = ham_env_create_db(m_env, &dbh, name, flags, param); + if (st) + throw error(st); + + return (hamsterdb::db(dbh)); + } + + /** Opens an existing Database in the Environment.
*/ + db open_db(uint16_t name, uint32_t flags = 0, + const ham_parameter_t *param = 0) { + ham_db_t *dbh; + + ham_status_t st = ham_env_open_db(m_env, &dbh, name, flags, param); + if (st) + throw error(st); + + return (hamsterdb::db(dbh)); + } + + /** Renames an existing Database in the Environment. */ + void rename_db(uint16_t oldname, uint16_t newname, uint32_t flags = 0) { + ham_status_t st = ham_env_rename_db(m_env, oldname, newname, flags); + if (st) + throw error(st); + } + + /** Deletes a Database from the Environment. */ + void erase_db(uint16_t name, uint32_t flags = 0) { + ham_status_t st = ham_env_erase_db(m_env, name, flags); + if (st) + throw error(st); + } + + /** Begin a new Transaction */ + txn begin(const char *name = 0) { + ham_txn_t *h; + ham_status_t st = ham_txn_begin(&h, m_env, name, 0, 0); + if (st) + throw error(st); + return (txn(h)); + } + + + /** Closes the Environment. */ + void close(uint32_t flags = 0) { + if (!m_env) + return; + // disable auto-cleanup; all objects will be destroyed when + // going out of scope + flags &= ~HAM_AUTO_CLEANUP; + ham_status_t st = ham_env_close(m_env, flags); + if (st) + throw error(st); + m_env = 0; + } + + /** Retrieves Environment parameters. */ + void get_parameters(ham_parameter_t *param) { + ham_status_t st = ham_env_get_parameters(m_env, param); + if (st) + throw error(st); + } + + /** Get all Database names. 
*/ + std::vector<uint16_t> get_database_names() { + uint32_t count = 32; + ham_status_t st; + std::vector<uint16_t> v(count); + + for (;;) { + st = ham_env_get_database_names(m_env, &v[0], &count); + if (!st) + break; + if (st && st!=HAM_LIMITS_REACHED) + throw error(st); + count += 16; + v.resize(count); + } + + v.resize(count); + return (v); + } + + private: + ham_env_t *m_env; +}; + +} // namespace hamsterdb + +/** + * @} + */ + +#endif // HAM_HAMSTERDB_HPP diff --git a/plugins/Dbx_kv/src/hamsterdb/include/ham/hamsterdb_int.h b/plugins/Dbx_kv/src/hamsterdb/include/ham/hamsterdb_int.h new file mode 100644 index 0000000000..ec05ece264 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/include/ham/hamsterdb_int.h @@ -0,0 +1,319 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file hamsterdb_int.h + * @brief Internal hamsterdb Embedded Storage functions. + * @author Christoph Rupp, chris@crupp.de + * + * Please be aware that the interfaces in this file are mostly for internal + * use. Unlike those in hamsterdb.h they are not stable and can be changed + * with every new version.
+ * + */ + +#ifndef HAM_HAMSTERDB_INT_H +#define HAM_HAMSTERDB_INT_H + +#include <ham/hamsterdb.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @defgroup ham_extended_api hamsterdb Enhanced API + * @{ + */ + +/** get the (non-persisted) flags of a key */ +#define ham_key_get_intflags(key) (key)->_flags + +/** + * set the flags of a key + * + * Note that the ham_find/ham_cursor_find/ham_cursor_find_ex flags must + * be defined such that those can peacefully co-exist with these; that's + * why those public flags start at the value 0x1000 (4096). + */ +#define ham_key_set_intflags(key, f) (key)->_flags=(f) + +/** + * Verifies the integrity of the Database + * + * This function is only interesting if you want to debug hamsterdb. + * + * @param db A valid Database handle + * @param flags Optional flags for the integrity check, combined with + * bitwise OR. Possible flags are: + * <ul> + * <li>@ref HAM_PRINT_GRAPH</li> Prints the Btree as a graph; stores + * the image as "graph.png" in the current working directory. It uses + * the "dot" tool from graphviz to generate the image. + * This functionality is only available in DEBUG builds! + * </ul> + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INTEGRITY_VIOLATED if the Database is broken + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_db_check_integrity(ham_db_t *db, uint32_t flags); + +/** Flag for ham_db_check_integrity */ +#define HAM_PRINT_GRAPH 1 + +/** + * Set a user-provided context pointer + * + * This function sets a user-provided context pointer. This can be any + * arbitrary pointer; it is stored in the Database handle and can be + * retrieved with @a ham_get_context_data. It is mainly used by Wrappers + * and language bindings. 
+ * + * @param db A valid Database handle + * @param data The pointer to the context data + */ +HAM_EXPORT void HAM_CALLCONV +ham_set_context_data(ham_db_t *db, void *data); + +/** + * Retrieves a user-provided context pointer + * + * This function retrieves a user-provided context pointer. This can be any + * arbitrary pointer which was previously stored with @a ham_set_context_data. + * + * @param db A valid Database handle + * @param dont_lock Whether the Environment mutex should be locked or not + * this is used to avoid recursive locks when retrieving the context + * data in a compare function + * + * @return The pointer to the context data + */ +HAM_EXPORT void * HAM_CALLCONV +ham_get_context_data(ham_db_t *db, ham_bool_t dont_lock); + +/** + * Retrieves the Database handle of a Cursor + * + * @param cursor A valid Cursor handle + * + * @return @a The Database handle of @a cursor + */ +HAM_EXPORT ham_db_t * HAM_CALLCONV +ham_cursor_get_database(ham_cursor_t *cursor); + +typedef struct min_max_avg_u32_t { + uint32_t min; + uint32_t max; + uint32_t avg; + uint32_t _total; /* for calculating the average */ + uint32_t _instances; /* for calculating the average */ +} min_max_avg_u32_t; + +/* btree metrics */ +typedef struct btree_metrics_t { + /* the database name of the btree */ + uint16_t database_name; + + /* number of pages */ + uint64_t number_of_pages; + + /* number of keys */ + uint64_t number_of_keys; + + /* total btree space, including overhead */ + uint64_t total_btree_space; + + /* static overhead per page */ + uint32_t overhead_per_page; + + /* number of keys stored per page (w/o duplicates) */ + min_max_avg_u32_t keys_per_page; + + /* payload storage assigned to the KeyLists */ + min_max_avg_u32_t keylist_ranges; + + /* payload storage assigned to the RecordLists */ + min_max_avg_u32_t recordlist_ranges; + + /* storage assigned to the Indices (if available) */ + min_max_avg_u32_t keylist_index; + + /* storage assigned to the Indices (if available) */ 
+ min_max_avg_u32_t recordlist_index; + + /* unused storage (i.e. gaps between pages, underfilled blocks etc) */ + min_max_avg_u32_t keylist_unused; + + /* unused storage (i.e. gaps between pages, underfilled blocks etc) */ + min_max_avg_u32_t recordlist_unused; + + /* number of blocks per page (if available) */ + min_max_avg_u32_t keylist_blocks_per_page; + + /* block sizes (if available) */ + min_max_avg_u32_t keylist_block_sizes; +} btree_metrics_t; + +/** + * Retrieves collected metrics from the hamsterdb Environment. Used mainly + * for testing. + * See below for the structure with the currently available metrics. + * This structure will change a lot; the first field is a version indicator + * that applications can use to verify that the structure layout is compatible. + * + * These metrics are NOT persisted to disk. + * + * Metrics marked "global" are stored globally and shared between multiple + * Environments. + */ +#define HAM_METRICS_VERSION 9 + +typedef struct ham_env_metrics_t { + /* the version indicator - must be HAM_METRICS_VERSION */ + uint16_t version; + + /* number of total allocations for the whole lifetime of the process */ + uint64_t mem_total_allocations; + + /* currently active allocations for the whole process */ + uint64_t mem_current_allocations; + + /* current amount of memory allocated and tracked by the process + * (excludes memory used by the kernel or not allocated with + * malloc/free) */ + uint64_t mem_current_usage; + + /* peak usage of memory (for the whole process) */ + uint64_t mem_peak_usage; + + /* the heap size of this process */ + uint64_t mem_heap_size; + + /* amount of pages fetched from disk */ + uint64_t page_count_fetched; + + /* amount of pages written to disk */ + uint64_t page_count_flushed; + + /* number of index pages in this Environment */ + uint64_t page_count_type_index; + + /* number of blob pages in this Environment */ + uint64_t page_count_type_blob; + + /* number of page-manager pages in this Environment */ 
+ uint64_t page_count_type_page_manager; + + /* number of successful freelist hits */ + uint64_t freelist_hits; + + /* number of freelist misses */ + uint64_t freelist_misses; + + /* number of successful cache hits */ + uint64_t cache_hits; + + /* number of cache misses */ + uint64_t cache_misses; + + /* number of blobs allocated */ + uint64_t blob_total_allocated; + + /* number of blobs read */ + uint64_t blob_total_read; + + /* (global) number of btree page splits */ + uint64_t btree_smo_split; + + /* (global) number of btree page merges */ + uint64_t btree_smo_merge; + + /* (global) number of extended keys */ + uint64_t extended_keys; + + /* (global) number of extended duplicate tables */ + uint64_t extended_duptables; + + /* number of bytes that the log/journal flushes to disk */ + uint64_t journal_bytes_flushed; + + /* PRO: log/journal bytes before compression */ + uint64_t journal_bytes_before_compression; + + /* PRO: log/journal bytes after compression */ + uint64_t journal_bytes_after_compression; + + /* PRO: record bytes before compression */ + uint64_t record_bytes_before_compression; + + /* PRO: record bytes after compression */ + uint64_t record_bytes_after_compression; + + /* PRO: key bytes before compression */ + uint64_t key_bytes_before_compression; + + /* PRO: key bytes after compression */ + uint64_t key_bytes_after_compression; + + /* PRO: set to the max. 
SIMD lane width (0 if SIMD is not available) */ + int simd_lane_width; + + /* btree metrics for leaf nodes */ + btree_metrics_t btree_leaf_metrics; + + /* btree metrics for internal nodes */ + btree_metrics_t btree_internal_metrics; + +} ham_env_metrics_t; + +/** + * Retrieves the current metrics from an Environment + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_env_get_metrics(ham_env_t *env, ham_env_metrics_t *metrics); + +/** + * Returns @ref HAM_TRUE if this hamsterdb library was compiled with debug + * diagnostics, checks and asserts + */ +HAM_EXPORT ham_bool_t HAM_CALLCONV +ham_is_debug(); + +/** + * Returns @ref HAM_TRUE if this hamsterdb library is the commercial + * closed-source "hamsterdb pro" edition + */ +HAM_EXPORT ham_bool_t HAM_CALLCONV +ham_is_pro(); + +/** + * Returns the end time of the evaluation period, if this is an evaluation + * license of the commercial closed-source "hamsterdb pro"; + * returns 0 otherwise + */ +HAM_EXPORT uint32_t HAM_CALLCONV +ham_is_pro_evaluation(); + +/** + * @} + */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif /* HAM_HAMSTERDB_INT_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/include/ham/hamsterdb_ola.h b/plugins/Dbx_kv/src/hamsterdb/include/ham/hamsterdb_ola.h new file mode 100644 index 0000000000..f65b98b8b1 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/include/ham/hamsterdb_ola.h @@ -0,0 +1,244 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @file hamsterdb_ola.h + * @brief Include file for hamsterdb OnLine Analytical functions + * @author Christoph Rupp, chris@crupp.de + * @version 2.1.10 + * + * This API is EXPERIMENTAL!! The interface is not yet stable. + */ + +#ifndef HAM_HAMSTERDB_OLA_H +#define HAM_HAMSTERDB_OLA_H + +#include <ham/hamsterdb.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * A predicate function with context parameters returning a bool value. + * + * The predicate function is applied to various analytical functions + * of this API and is generally used to select keys where a predicate applies. + */ +typedef struct { + /** A function pointer; receives a key, returns a bool */ + ham_bool_t (*predicate_func)(const void *key_data, uint16_t key_size, + void *context); + + /** User-supplied context data */ + void *context; + +} hola_bool_predicate_t; + + +/** + * A structure which returns the result of an operation. + * + * For now, the result is either a @a uint64_t counter or a @a double value. + * The @a type member specifies which one is used; @a type's value is + * one of @a HAM_TYPE_UINT64 or @a HAM_TYPE_REAL64. + */ +typedef struct { + union { + /** The result as a 64bit unsigned integer */ + uint64_t result_u64; + + /** The result as a 64bit real */ + double result_double; + } u; + + /** The actual type in the union - one of the @a HAM_TYPE_* macros */ + int type; + +} hola_result_t; + + +/** + * Counts the keys in a Database + * + * This is a non-distinct count. If the Database has duplicate keys then + * they are included in the count. + * + * The actual count is returned in @a result->u.result_u64. @a result->type + * is set to @a HAM_TYPE_UINT64.
+ * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if one of the parameters is NULL + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +hola_count(ham_db_t *db, ham_txn_t *txn, hola_result_t *result); + +/** + * Selectively counts the keys in a Database + * + * This is a non-distinct count. If the Database has duplicate keys then + * they are included in the count. The predicate function is applied to + * each key. If it returns true then the key (and its duplicates) is included + * in the count; otherwise the key is ignored. + * + * The actual count is returned in @a result->u.result_u64. @a result->type + * is set to @a HAM_TYPE_UINT64. + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if one of the parameters is NULL + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +hola_count_if(ham_db_t *db, ham_txn_t *txn, hola_bool_predicate_t *pred, + hola_result_t *result); + +/** + * Counts the distinct keys in a Database + * + * This is a distinct count. If the Database has duplicate keys then + * they are not included in the count. + * + * The actual count is returned in @a result->u.result_u64. @a result->type + * is set to @a HAM_TYPE_UINT64. + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if one of the parameters is NULL + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +hola_count_distinct(ham_db_t *db, ham_txn_t *txn, hola_result_t *result); + +/** + * Selectively counts the distinct keys in a Database + * + * This is a distinct count. If the Database has duplicate keys then + * they are not included in the count. The predicate function is applied to + * each key. If it returns true then the key is included in the count; + * otherwise the key is ignored. + * + * The actual count is returned in @a result->u.result_u64. @a result->type + * is set to @a HAM_TYPE_UINT64.
+ * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if one of the parameters is NULL + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +hola_count_distinct_if(ham_db_t *db, ham_txn_t *txn, + hola_bool_predicate_t *pred, hola_result_t *result); + +/** + * Calculates the average of all keys. + * + * This is a non-distinct function and includes all duplicate keys. + * + * Internally, a 64bit counter is used for the calculation. This function + * does not protect against an overflow of this counter. + * + * The keys in the database (@a db) have to be numeric, which means that + * the Database's type must be one of @a HAM_TYPE_UINT8, @a HAM_TYPE_UINT16, + * HAM_TYPE_UINT32, @a HAM_TYPE_UINT64, @a HAM_TYPE_REAL32 or + * @a HAM_TYPE_REAL64. + * + * The actual result is returned in @a result->u.result_u64 or + * @a result->u.result_double, depending on the Database's configuration. + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if one of the parameters is NULL + * @return @ref HAM_INV_PARAMETER if the database is not numeric + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +hola_average(ham_db_t *db, ham_txn_t *txn, hola_result_t *result); + +/** + * Calculates the average of all keys where a predicate applies. + * + * This is a non-distinct function and includes all duplicate keys for which + * the predicate function returns true. + * + * Internally, a 64bit counter is used for the calculation. This function + * does not protect against an overflow of this counter. + * + * The keys in the database (@a db) have to be numeric, which means that + * the Database's type must be one of @a HAM_TYPE_UINT8, @a HAM_TYPE_UINT16, + * HAM_TYPE_UINT32, @a HAM_TYPE_UINT64, @a HAM_TYPE_REAL32 or + * @a HAM_TYPE_REAL64. + * + * The actual result is returned in @a result->u.result_u64 or + * @a result->u.result_double, depending on the Database's configuration. 
+ * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if one of the parameters is NULL + * @return @ref HAM_INV_PARAMETER if the database is not numeric + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +hola_average_if(ham_db_t *db, ham_txn_t *txn, hola_bool_predicate_t *pred, + hola_result_t *result); + +/** + * Calculates the sum of all keys. + * + * This is a non-distinct function and includes all duplicate keys. + * + * Internally, a 64bit counter is used for the calculation. This function + * does not protect against an overflow of this counter. + * + * The keys in the database (@a db) have to be numeric, which means that + * the Database's type must be one of @a HAM_TYPE_UINT8, @a HAM_TYPE_UINT16, + * HAM_TYPE_UINT32, @a HAM_TYPE_UINT64, @a HAM_TYPE_REAL32 or + * @a HAM_TYPE_REAL64. + * + * The actual result is returned in @a result->u.result_u64 or + * @a result->u.result_double, depending on the Database's configuration. + * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if one of the parameters is NULL + * @return @ref HAM_INV_PARAMETER if the database is not numeric + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +hola_sum(ham_db_t *db, ham_txn_t *txn, hola_result_t *result); + +/** + * Calculates the sum of all keys where a predicate applies. + * + * This is a non-distinct function and includes all duplicate keys for which + * the predicate function returns true. + * + * Internally, a 64bit counter is used for the calculation. This function + * does not protect against an overflow of this counter. + * + * The keys in the database (@a db) have to be numeric, which means that + * the Database's type must be one of @a HAM_TYPE_UINT8, @a HAM_TYPE_UINT16, + * HAM_TYPE_UINT32, @a HAM_TYPE_UINT64, @a HAM_TYPE_REAL32 or + * @a HAM_TYPE_REAL64. + * + * The actual result is returned in @a result->u.result_u64 or + * @a result->u.result_double, depending on the Database's configuration. 
+ * + * @return @ref HAM_SUCCESS upon success + * @return @ref HAM_INV_PARAMETER if one of the parameters is NULL + * @return @ref HAM_INV_PARAMETER if the database is not numeric + */ +HAM_EXPORT ham_status_t HAM_CALLCONV +hola_sum_if(ham_db_t *db, ham_txn_t *txn, hola_bool_predicate_t *pred, + hola_result_t *result); + +/** + * @} + */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif /* HAM_HAMSTERDB_OLA_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/include/ham/hamsterdb_srv.h b/plugins/Dbx_kv/src/hamsterdb/include/ham/hamsterdb_srv.h new file mode 100644 index 0000000000..83ffef8f2e --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/include/ham/hamsterdb_srv.h @@ -0,0 +1,118 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef HAM_HAMSTERDB_SRV_H +#define HAM_HAMSTERDB_SRV_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <ham/hamsterdb.h> + +/** + * @defgroup ham_server hamsterdb Embedded Server + * @{ + */ + +/** + * A configuration structure + * + * It is always recommended to initialize the full structure with zeroes + * before using it. + */ +typedef struct { + /** The server port */ + uint16_t port; + + /** Path of the access log, or NULL if no log should be written + * - currently NOT USED! */ + const char *access_log_path; + + /** Path of the error log, or NULL if no log should be written + * - currently NOT USED!
*/ + const char *error_log_path; + +} ham_srv_config_t; + +/** + * A server handle + */ +struct ham_srv_t; +typedef struct ham_srv_t ham_srv_t; + +/** + * Initialize the server + * + * This function initializes a ham_srv_t handle and starts the hamsterdb + * database server on the port specified in the configuration object. + * + * @param config A configuration structure + * @param srv A pointer to a ham_srv_t pointer; will be allocated + * if this function returns successfully + * + * @return HAM_SUCCESS on success + * @return HAM_OUT_OF_MEMORY if memory could not be allocated + */ +extern ham_status_t +ham_srv_init(ham_srv_config_t *config, ham_srv_t **srv); + +/** + * Add a hamsterdb Environment + * + * This function adds a new hamsterdb Environment to the server. The + * Environment has to be initialized properly by the caller. It will be + * served at ham://localhost:port/urlname, where @a port was specified + * for @ref ham_srv_init and @a urlname is the third parameter to this + * function. + * + * A client accessing this Environment will specify this URL as a filename, + * and hamsterdb will transparently connect to this server. + * + * @param srv A valid ham_srv_t handle + * @param env A valid hamsterdb Environment handle + * @param urlname URL of this Environment + * + * @return HAM_SUCCESS on success + * @return HAM_LIMITS_REACHED if more than the max. number of Environments + * were added (default limit: 128) + */ +extern ham_status_t +ham_srv_add_env(ham_srv_t *srv, ham_env_t *env, const char *urlname); + +/** + * Release memory and clean up + * + * @param srv A valid ham_srv_t handle + * + * @warning + * This function will not close open handles (i.e. of Databases, Cursors + * or Transactions). The caller has to close the remaining Environment + * handles (@see ham_env_close).
+ */ +extern void +ham_srv_close(ham_srv_t *srv); + +/** + * @} + */ + + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif /* HAM_HAMSTERDB_SRV_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/include/ham/msstdint.h b/plugins/Dbx_kv/src/hamsterdb/include/ham/msstdint.h new file mode 100644 index 0000000000..4fe0ef9a9b --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/include/ham/msstdint.h @@ -0,0 +1,259 @@ +// ISO C9x compliant stdint.h for Microsoft Visual Studio +// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 +// +// Copyright (c) 2006-2013 Alexander Chemeris +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the product nor the names of its contributors may +// be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO +// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef _MSC_VER // [ +#error "Use this header only with Microsoft Visual C++ compilers!" +#endif // _MSC_VER ] + +#ifndef _MSC_STDINT_H_ // [ +#define _MSC_STDINT_H_ + +#if _MSC_VER > 1000 +#pragma once +#endif + +#if _MSC_VER >= 1600 // [ +#include <stdint.h> +#else // ] _MSC_VER >= 1600 [ + +#include <limits.h> + +// For Visual Studio 6 in C++ mode and for many Visual Studio versions when +// compiling for ARM we should wrap <wchar.h> include with 'extern "C++" {}' +// or compiler give many errors like this: +// error C2733: second C linkage of overloaded function 'wmemchr' not allowed +#ifdef __cplusplus +extern "C" { +#endif +# include <wchar.h> +#ifdef __cplusplus +} +#endif + +// Define _W64 macros to mark types changing their size, like intptr_t. +#ifndef _W64 +# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 +# define _W64 __w64 +# else +# define _W64 +# endif +#endif + + +// 7.18.1 Integer types + +// 7.18.1.1 Exact-width integer types + +// Visual Studio 6 and Embedded Visual C++ 4 doesn't +// realize that, e.g. char has the same size as __int8 +// so we give up on __intX for them. 
+#if (_MSC_VER < 1300) + typedef signed char int8_t; + typedef signed short int16_t; + typedef signed int int32_t; + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; +#else + typedef signed __int8 int8_t; + typedef signed __int16 int16_t; + typedef signed __int32 int32_t; + typedef unsigned __int8 uint8_t; + typedef unsigned __int16 uint16_t; + typedef unsigned __int32 uint32_t; +#endif +typedef signed __int64 int64_t; +typedef unsigned __int64 uint64_t; + + +// 7.18.1.2 Minimum-width integer types +typedef int8_t int_least8_t; +typedef int16_t int_least16_t; +typedef int32_t int_least32_t; +typedef int64_t int_least64_t; +typedef uint8_t uint_least8_t; +typedef uint16_t uint_least16_t; +typedef uint32_t uint_least32_t; +typedef uint64_t uint_least64_t; + +// 7.18.1.3 Fastest minimum-width integer types +typedef int8_t int_fast8_t; +typedef int16_t int_fast16_t; +typedef int32_t int_fast32_t; +typedef int64_t int_fast64_t; +typedef uint8_t uint_fast8_t; +typedef uint16_t uint_fast16_t; +typedef uint32_t uint_fast32_t; +typedef uint64_t uint_fast64_t; + +// 7.18.1.4 Integer types capable of holding object pointers +#ifdef _WIN64 // [ + typedef signed __int64 intptr_t; + typedef unsigned __int64 uintptr_t; +#else // _WIN64 ][ + typedef _W64 signed int intptr_t; + typedef _W64 unsigned int uintptr_t; +#endif // _WIN64 ] + +// 7.18.1.5 Greatest-width integer types +typedef int64_t intmax_t; +typedef uint64_t uintmax_t; + + +// 7.18.2 Limits of specified-width integer types + +#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 + +// 7.18.2.1 Limits of exact-width integer types +#define INT8_MIN ((int8_t)_I8_MIN) +#define INT8_MAX _I8_MAX +#define INT16_MIN ((int16_t)_I16_MIN) +#define INT16_MAX _I16_MAX +#define INT32_MIN ((int32_t)_I32_MIN) +#define INT32_MAX _I32_MAX +#define INT64_MIN ((int64_t)_I64_MIN) +#define INT64_MAX _I64_MAX +#define 
UINT8_MAX _UI8_MAX +#define UINT16_MAX _UI16_MAX +#define UINT32_MAX _UI32_MAX +#define UINT64_MAX _UI64_MAX + +// 7.18.2.2 Limits of minimum-width integer types +#define INT_LEAST8_MIN INT8_MIN +#define INT_LEAST8_MAX INT8_MAX +#define INT_LEAST16_MIN INT16_MIN +#define INT_LEAST16_MAX INT16_MAX +#define INT_LEAST32_MIN INT32_MIN +#define INT_LEAST32_MAX INT32_MAX +#define INT_LEAST64_MIN INT64_MIN +#define INT_LEAST64_MAX INT64_MAX +#define UINT_LEAST8_MAX UINT8_MAX +#define UINT_LEAST16_MAX UINT16_MAX +#define UINT_LEAST32_MAX UINT32_MAX +#define UINT_LEAST64_MAX UINT64_MAX + +// 7.18.2.3 Limits of fastest minimum-width integer types +#define INT_FAST8_MIN INT8_MIN +#define INT_FAST8_MAX INT8_MAX +#define INT_FAST16_MIN INT16_MIN +#define INT_FAST16_MAX INT16_MAX +#define INT_FAST32_MIN INT32_MIN +#define INT_FAST32_MAX INT32_MAX +#define INT_FAST64_MIN INT64_MIN +#define INT_FAST64_MAX INT64_MAX +#define UINT_FAST8_MAX UINT8_MAX +#define UINT_FAST16_MAX UINT16_MAX +#define UINT_FAST32_MAX UINT32_MAX +#define UINT_FAST64_MAX UINT64_MAX + +// 7.18.2.4 Limits of integer types capable of holding object pointers +#ifdef _WIN64 // [ +# define INTPTR_MIN INT64_MIN +# define INTPTR_MAX INT64_MAX +# define UINTPTR_MAX UINT64_MAX +#else // _WIN64 ][ +# define INTPTR_MIN INT32_MIN +# define INTPTR_MAX INT32_MAX +# define UINTPTR_MAX UINT32_MAX +#endif // _WIN64 ] + +// 7.18.2.5 Limits of greatest-width integer types +#define INTMAX_MIN INT64_MIN +#define INTMAX_MAX INT64_MAX +#define UINTMAX_MAX UINT64_MAX + +// 7.18.3 Limits of other integer types + +#ifdef _WIN64 // [ +# define PTRDIFF_MIN _I64_MIN +# define PTRDIFF_MAX _I64_MAX +#else // _WIN64 ][ +# define PTRDIFF_MIN _I32_MIN +# define PTRDIFF_MAX _I32_MAX +#endif // _WIN64 ] + +#define SIG_ATOMIC_MIN INT_MIN +#define SIG_ATOMIC_MAX INT_MAX + +#ifndef SIZE_MAX // [ +# ifdef _WIN64 // [ +# define SIZE_MAX _UI64_MAX +# else // _WIN64 ][ +# define SIZE_MAX _UI32_MAX +# endif // _WIN64 ] +#endif // SIZE_MAX ] + +// 
WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h> +#ifndef WCHAR_MIN // [ +# define WCHAR_MIN 0 +#endif // WCHAR_MIN ] +#ifndef WCHAR_MAX // [ +# define WCHAR_MAX _UI16_MAX +#endif // WCHAR_MAX ] + +#define WINT_MIN 0 +#define WINT_MAX _UI16_MAX + +#endif // __STDC_LIMIT_MACROS ] + + +// 7.18.4 Limits of other integer types + +#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 + +// 7.18.4.1 Macros for minimum-width integer constants + +#define INT8_C(val) val##i8 +#define INT16_C(val) val##i16 +#define INT32_C(val) val##i32 +#define INT64_C(val) val##i64 + +#define UINT8_C(val) val##ui8 +#define UINT16_C(val) val##ui16 +#define UINT32_C(val) val##ui32 +#define UINT64_C(val) val##ui64 + +// 7.18.4.2 Macros for greatest-width integer constants +// These #ifndef's are needed to prevent collisions with <boost/cstdint.hpp>. +// Check out Issue 9 for the details. +#ifndef INTMAX_C // [ +# define INTMAX_C INT64_C +#endif // INTMAX_C ] +#ifndef UINTMAX_C // [ +# define UINTMAX_C UINT64_C +#endif // UINTMAX_C ] + +#endif // __STDC_CONSTANT_MACROS ] + +#endif // _MSC_VER >= 1600 ] + +#endif // _MSC_STDINT_H_ ] diff --git a/plugins/Dbx_kv/src/hamsterdb/include/ham/types.h b/plugins/Dbx_kv/src/hamsterdb/include/ham/types.h new file mode 100644 index 0000000000..54d75aa7e0 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/include/ham/types.h @@ -0,0 +1,143 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/**
 * @file types.h
 * @brief Portable typedefs for hamsterdb Embedded Storage.
 * @author Christoph Rupp, chris@crupp.de
 *
 * Detects the target platform (HAM_OS_WIN32 / HAM_OS_POSIX) and the word
 * size (HAM_32BIT / HAM_64BIT), defines the HAM_EXPORT and HAM_CALLCONV
 * decoration macros and declares the basic integer, boolean and status
 * typedefs used by the public API.
 */

#ifndef HAM_TYPES_H
#define HAM_TYPES_H

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Check the operating system and word size.
 *
 * MSVC itself only predefines _WIN32/_WIN64; the un-prefixed WIN32/WIN64
 * usually come from project settings. Both spellings are accepted here,
 * otherwise a 64bit Windows build is silently tagged as HAM_32BIT.
 */
#if defined(WIN32) || defined(_WIN32)
#  undef  HAM_OS_WIN32
#  define HAM_OS_WIN32 1
#  if defined(WIN64) || defined(_WIN64)
#    undef  HAM_64BIT
#    define HAM_64BIT 1
#  else
#    undef  HAM_32BIT
#    define HAM_32BIT 1
#  endif
#else /* posix? */
#  undef  HAM_OS_POSIX
#  define HAM_OS_POSIX 1
#  if defined(__LP64__) || defined(__LP64) || __WORDSIZE == 64
#    undef  HAM_64BIT
#    define HAM_64BIT 1
#  else
#    undef  HAM_32BIT
#    define HAM_32BIT 1
#  endif
#endif

/*
 * Exactly one platform macro must be set. HAM_OS_POSIX/HAM_OS_WIN32 can also
 * be injected from the command line, therefore both failure modes are checked.
 */
#if defined(HAM_OS_POSIX) && defined(HAM_OS_WIN32)
#  error "Ambiguous arch - both HAM_OS_POSIX and HAM_OS_WIN32 are defined"
#elif !defined(HAM_OS_POSIX) && !defined(HAM_OS_WIN32)
#  error "Unknown arch - neither HAM_OS_POSIX nor HAM_OS_WIN32 defined"
#endif

/*
 * improve memory debugging on WIN32 by using crtdbg.h (only MSVC
 * compiler and debug builds!)
 *
 * make sure crtdbg.h is loaded before malloc.h!
 */
#if defined(_MSC_VER) && defined(HAM_OS_WIN32)
#  if (defined(WIN32) || defined(_WIN32) || defined(__WIN32)) \
        && !defined(UNDER_CE)
#    if defined(DEBUG) || defined(_DEBUG)
#      ifndef _CRTDBG_MAP_ALLOC
#        define _CRTDBG_MAP_ALLOC 1
#      endif
#    endif
#    include <crtdbg.h>
#    include <malloc.h>
#  endif
#endif

/*
 * Create the EXPORT macro for Microsoft Visual C++
 */
#ifndef HAM_EXPORT
#  ifdef _MSC_VER
#    define HAM_EXPORT __declspec(dllexport)
#  else
#    define HAM_EXPORT extern
#  endif
#endif

/*
 * The default calling convention is cdecl
 */
#ifndef HAM_CALLCONV
#  define HAM_CALLCONV
#endif

/*
 * Common typedefs. Since stdint.h is not available on older versions of
 * Microsoft Visual Studio, they get declared here.
 * http://msinttypes.googlecode.com/svn/trunk/stdint.h
 */
#ifdef _MSC_VER
#  include <ham/msstdint.h>
#else
#  include <stdint.h>
#endif

/* Deprecated typedefs; used prior to 2.1.9. Please do not use them! */
typedef int64_t     ham_s64_t;
typedef uint64_t    ham_u64_t;
typedef int32_t     ham_s32_t;
typedef uint32_t    ham_u32_t;
typedef int16_t     ham_s16_t;
typedef uint16_t    ham_u16_t;
typedef int8_t      ham_s8_t;
typedef uint8_t     ham_u8_t;

/*
 * Undefine macros to avoid macro redefinitions
 */
#undef HAM_INVALID_FD
#undef HAM_FALSE
#undef HAM_TRUE

/**
 * a boolean type
 */
typedef int         ham_bool_t;
#define HAM_FALSE   0
#define HAM_TRUE    (!HAM_FALSE)

/**
 * typedef for error- and status-code
 */
typedef int         ham_status_t;


#ifdef __cplusplus
} // extern "C"
#endif

#endif /* HAM_TYPES_H */
+ * + * @thread_safe: yes + * @exception_safe: nothrow + */ + +#ifndef HAM_ROOT_H +#define HAM_ROOT_H + +//#define HAM_ENABLE_HELGRIND 1 + +// some feature macros in config.h must be set *before* inclusion +// of any system headers to have the desired effect. +// assume sane default values if there is no config.h. +#ifdef HAVE_CONFIG_H +# include "../config.h" +#else +# define HAVE_MMAP 1 +# define HAVE_UNMMAP 1 +# define HAVE_PREAD 1 +# define HAVE_PWRITE 1 +#endif + +#include "ham/types.h" + +// check for a valid build +#if (!defined(HAM_DEBUG)) +# if (defined(_DEBUG) || defined(DEBUG)) +# define HAM_DEBUG 1 +# endif +#endif + +// the default cache size is 2 MB +#define HAM_DEFAULT_CACHE_SIZE (2 * 1024 * 1024) + +// the default page size is 16 kb +#define HAM_DEFAULT_PAGE_SIZE (16 * 1024) + +// use tcmalloc? +#if HAVE_GOOGLE_TCMALLOC_H == 1 +# if HAVE_LIBTCMALLOC_MINIMAL == 1 +# define HAM_USE_TCMALLOC 1 +# endif +#endif + +#include <stddef.h> +#define OFFSETOF(type, member) offsetof(type, member) + +// helper macros to improve CPU branch prediction +#if defined __GNUC__ +# define likely(x) __builtin_expect ((x), 1) +# define unlikely(x) __builtin_expect ((x), 0) +#else +# define likely(x) (x) +# define unlikely(x) (x) +#endif + +#ifdef WIN32 +// MSVC: disable warning about use of 'this' in base member initializer list +# pragma warning(disable:4355) +# define WIN32_MEAN_AND_LEAN +# include <windows.h> +#endif + +// some compilers define min and max as macros; this leads to errors +// when using std::min and std::max +#ifdef min +# undef min +#endif + +#ifdef max +# undef max +#endif + +// a macro to cast pointers to u64 and vice versa to avoid compiler +// warnings if the sizes of ptr and u64 are not equal +#if defined(HAM_32BIT) && (!defined(_MSC_VER)) +# define U64_TO_PTR(p) (uint8_t *)(int)p +# define PTR_TO_U64(p) (uint64_t)(int)p +#else +# define U64_TO_PTR(p) p +# define PTR_TO_U64(p) p +#endif + +#endif /* HAM_ROOT_H */ diff --git 
a/plugins/Dbx_kv/src/hamsterdb/src/1base/abi.h b/plugins/Dbx_kv/src/hamsterdb/src/1base/abi.h new file mode 100644 index 0000000000..57c086f24c --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1base/abi.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Returns the demangled name of a class + * + * @exception_safe: nothrow + * @thread_safe: yes + */ + +#ifndef HAM_ABI_H +#define HAM_ABI_H + +#include "0root/root.h" + +#ifdef HAVE_GCC_ABI_DEMANGLE +# include <cxxabi.h> +#endif + +#include <string> +#include <stdlib.h> + +// Always verify that a file of level N does not include headers > N! 
namespace hamsterdb {

// Returns the demangled type name of |t|.
//
// Demangling is only attempted when HAVE_GCC_ABI_DEMANGLE is defined;
// otherwise, or if demangling fails, an empty string is returned.
template<class T> inline std::string
get_classname(const T& t)
{
#ifndef HAVE_GCC_ABI_DEMANGLE
  return ("");
#else
  int rc;
  char *demangled = abi::__cxa_demangle(typeid(t).name(), 0, 0, &rc);
  if (demangled == 0)
    return ("");

  // the buffer was allocated by __cxa_demangle and has to be released
  // with free(), no matter whether the call succeeded
  std::string result;
  if (rc == 0)
    result = demangled;
  ::free(demangled);
  return (result);
#endif
}

} // namespace hamsterdb
The internal memory + * is released when the DynamicArray instance is destructed. + * + * Unlike std::vector, the DynamicArray uses libc functions for constructing, + * copying and initializing elements. + */ +template<typename T> +class DynamicArray +{ + public: + typedef T value_t; + typedef T *pointer_t; + + DynamicArray(size_t size = 0) + : m_ptr(0), m_size(0), m_own(true) { + resize(size); + } + + DynamicArray(size_t size, uint8_t fill_byte) + : m_ptr(0), m_size(0), m_own(true) { + resize(size); + if (m_ptr) + ::memset(m_ptr, fill_byte, sizeof(T) * m_size); + } + + ~DynamicArray() { + clear(); + } + + void append(const T *ptr, size_t size) { + size_t old_size = m_size; + T *p = (T *)resize(m_size + size); + ::memcpy(p + old_size, ptr, sizeof(T) * size); + } + + void copy(const T *ptr, size_t size) { + resize(size); + ::memcpy(m_ptr, ptr, sizeof(T) * size); + m_size = size; + } + + void overwrite(uint32_t position, const T *ptr, size_t size) { + ::memcpy(((uint8_t *)m_ptr) + position, ptr, sizeof(T) * size); + } + + T *resize(size_t size) { + if (size > m_size) { + m_ptr = Memory::reallocate<T>(m_ptr, sizeof(T) * size); + m_size = size; + } + return (m_ptr); + } + + T *resize(size_t size, uint8_t fill_byte) { + resize(size); + if (m_ptr) + ::memset(m_ptr, fill_byte, sizeof(T) * size); + return (m_ptr); + } + + size_t get_size() const { + return (m_size); + } + + void set_size(size_t size) { + m_size = size; + } + + T *get_ptr() { + return (m_ptr); + } + + const T *get_ptr() const { + return (m_ptr); + } + + void assign(T *ptr, size_t size) { + clear(); + m_ptr = ptr; + m_size = size; + } + + void clear(bool release_memory = true) { + if (m_own && release_memory) + Memory::release(m_ptr); + m_ptr = 0; + m_size = 0; + } + + bool is_empty() const { + return (m_size == 0); + } + + void disown() { + m_own = false; + } + + private: + // Pointer to the data + T *m_ptr; + + // The size of the array + size_t m_size; + + // True if the destructor should free the pointer 
+ bool m_own; +}; + +/* + * A ByteArray is a DynamicArray for bytes + */ +typedef DynamicArray<uint8_t> ByteArray; + +} // namespace hamsterdb + +#endif // HAM_DYNAMIC_ARRAY_H diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1base/error.cc b/plugins/Dbx_kv/src/hamsterdb/src/1base/error.cc new file mode 100644 index 0000000000..c7ebc530bb --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1base/error.cc @@ -0,0 +1,117 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +#include <string.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> + +// Always verify that a file of level N does not include headers > N! +#include "1base/util.h" +#include "1globals/globals.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +void (*ham_test_abort)(void); + +static int +dbg_snprintf(char *str, size_t size, const char *format, ...) 
+{ + int s; + + va_list ap; + va_start(ap, format); + s = util_vsnprintf(str, size, format, ap); + va_end(ap); + + return (s); +} + +void HAM_CALLCONV +default_errhandler(int level, const char *message) +{ +#ifndef HAM_DEBUG + if (level == HAM_DEBUG_LEVEL_DEBUG) + return; +#endif + fprintf(stderr, "%s\n", message); +} + +void +dbg_prepare(int level, const char *file, int line, const char *function, + const char *expr) +{ + Globals::ms_error_level = level; + Globals::ms_error_file = file; + Globals::ms_error_line = line; + Globals::ms_error_expr = expr; + Globals::ms_error_function = function; +} + +void +dbg_log(const char *format, ...) +{ + int s = 0; + char buffer[1024 * 4]; + + va_list ap; + va_start(ap, format); +#ifdef HAM_DEBUG + s = dbg_snprintf(buffer, sizeof(buffer), "%s[%d]: ", + Globals::ms_error_file, Globals::ms_error_line); + util_vsnprintf(buffer + s, sizeof(buffer) - s, format, ap); +#else + if (Globals::ms_error_function) + s = dbg_snprintf(buffer, sizeof(buffer), "%s: ", + Globals::ms_error_function); + util_vsnprintf(buffer + s, sizeof(buffer) - s, format, ap); +#endif + va_end(ap); + + Globals::ms_error_handler(Globals::ms_error_level, buffer); +} + +/* coverity[+kill] */ +void +dbg_verify_failed(int level, const char *file, int line, const char *function, + const char *expr) +{ + char buffer[1024 * 4]; + + if (!expr) + expr = "(none)"; + + dbg_snprintf(buffer, sizeof(buffer), + "ASSERT FAILED in file %s, line %d:\n\t\"%s\"\n", + file, line, expr); + buffer[sizeof(buffer) - 1] = '\0'; + + Globals::ms_error_handler(Globals::ms_error_level, buffer); + + if (ham_test_abort) + ham_test_abort(); + else + abort(); +} + +} // namespace hamsterdb + diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1base/error.h b/plugins/Dbx_kv/src/hamsterdb/src/1base/error.h new file mode 100644 index 0000000000..f02a8a8c24 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1base/error.h @@ -0,0 +1,120 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Error handling routines, assert macros, logging facilities + * + * @exception_safe: nothrow + * @thread_safe: no (b/c of the logging macros) + */ + +#ifndef HAM_ERROR_H +#define HAM_ERROR_H + +#include "0root/root.h" + +#include "ham/hamsterdb.h" + +// Always verify that a file of level N does not include headers > N! + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +// +// A generic exception for storing a status code +// +struct Exception +{ + Exception(ham_status_t st) + : code(st) { + } + + ham_status_t code; +}; + +// the default error handler +void HAM_CALLCONV +default_errhandler(int level, const char *message); + +extern void +dbg_prepare(int level, const char *file, int line, const char *function, + const char *expr); + +extern void +dbg_log(const char *format, ...); + +#define CLANG_ANALYZER_NORETURN +#if __clang__ +# if __has_feature(attribute_analyzer_noreturn) +# undef CLANG_ANALYZER_NORETURN +# define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn)) +# endif +#endif + +// causes the actual abort() +extern void +dbg_verify_failed(int level, const char *file, int line, + const char *function, const char *expr) CLANG_ANALYZER_NORETURN; + +// a hook for unittests; will be triggered when an assert fails +extern void (*ham_test_abort)(); + +// if your compiler does not support __FUNCTION__, you can define it here: +// #define 
/*
 * in debug mode we write trace()-messages to stderr, and assert()
 * is enabled.
 *
 * not every preprocessor supports ellipsis as macro-arguments -
 * therefore we have to use brackets, so preprocessors treat multiple
 * arguments like a single argument. and we need to lock the output,
 * otherwise we are not thread-safe. this is super-ugly.
 *
 * Usage: ham_log(("value is %d", value));   <-- note the double brackets
 */
#ifdef HAM_DEBUG
#   define ham_assert(e) while (!(e)) {                                        \
                hamsterdb::dbg_verify_failed(HAM_DEBUG_LEVEL_FATAL, __FILE__,   \
                        __LINE__, __FUNCTION__, #e);                            \
                break;                                                          \
             }
#else /* !HAM_DEBUG */
#   define ham_assert(e)      (void)0
#endif /* HAM_DEBUG */

// ham_trace(), ham_log() and ham_verify() are available in every build
#define ham_trace(f)     do {                                                  \
                hamsterdb::dbg_prepare(HAM_DEBUG_LEVEL_DEBUG, __FILE__,         \
                    __LINE__, __FUNCTION__, 0);                                 \
                hamsterdb::dbg_log f;                                           \
              } while (0)

#define ham_log(f)       do {                                                  \
                hamsterdb::dbg_prepare(HAM_DEBUG_LEVEL_NORMAL, __FILE__,        \
                    __LINE__, __FUNCTION__, 0);                                 \
                hamsterdb::dbg_log f;                                           \
              } while (0)

// wrapped in do/while(0) so that the macro behaves like a single statement;
// a bare "if (...) { }" breaks (or silently re-binds the else branch) when
// the macro is used as the body of an if/else
#define ham_verify(e)    do {                                                  \
                if (!(e)) {                                                     \
                  hamsterdb::dbg_verify_failed(HAM_DEBUG_LEVEL_FATAL,           \
                      __FILE__, __LINE__, __FUNCTION__, #e);                    \
                }                                                               \
              } while (0)
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * A operating-system dependent mutex + * + * @exception_safe: nothrow + * @thread_safe: yes + */ + +#ifndef HAM_MUTEX_H +#define HAM_MUTEX_H + +#include "0root/root.h" + +#define BOOST_ALL_NO_LIB // disable MSVC auto-linking +#include <boost/version.hpp> +#include <boost/thread/mutex.hpp> +#include <boost/thread/recursive_mutex.hpp> +#include <boost/thread/thread.hpp> +#include <boost/thread/tss.hpp> +#include <boost/thread/condition.hpp> + +// Always verify that a file of level N does not include headers > N! + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +typedef boost::mutex::scoped_lock ScopedLock; +typedef boost::thread Thread; +typedef boost::condition Condition; +typedef boost::mutex Mutex; +typedef boost::recursive_mutex RecursiveMutex; + +} // namespace hamsterdb + +#endif /* HAM_MUTEX_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1base/packstart.h b/plugins/Dbx_kv/src/hamsterdb/src/1base/packstart.h new file mode 100644 index 0000000000..3a6b1981a7 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1base/packstart.h @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Macros for packing structures; should work with most compilers. 
+ * + * Example usage: + * + * #include "packstart.h" + * + * typedef HAM_PACK_0 struct HAM_PACK_1 foo { + * int bar; + * } HAM_PACK_2 foo_t; + * + * #include "packstop.h" + * + * @exception_safe: nothrow + * @thread_safe: yes + */ + +/* This class does NOT include root.h! */ + +#ifdef __GNUC__ +# if (((__GNUC__==2) && (__GNUC_MINOR__>=7)) || (__GNUC__>2)) +# define HAM_PACK_2 __attribute__ ((packed)) +# define _NEWGNUC_ +# endif +#endif + +#ifdef __WATCOMC__ +# define HAM_PACK_0 _Packed +#endif + +#if (defined(_MSC_VER) && (_MSC_VER >= 900)) || defined(__BORLANDC__) +# define _NEWMSC_ +#endif +#if !defined(_NEWGNUC_) && !defined(__WATCOMC__) && !defined(_NEWMSC_) +# pragma pack(1) +#endif +#ifdef _NEWMSC_ +# pragma pack(push, 1) +# define HAM_PACK_2 __declspec(align(1)) +#endif + +#if defined(_NEWMSC_) && !defined(_WIN32_WCE) +# pragma pack(push, 1) +# define HAM_PACK_2 __declspec(align(1)) +#endif + +#ifndef HAM_PACK_0 +# define HAM_PACK_0 +#endif + +#ifndef HAM_PACK_1 +# define HAM_PACK_1 +#endif + +#ifndef HAM_PACK_2 +# define HAM_PACK_2 +#endif + diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1base/packstop.h b/plugins/Dbx_kv/src/hamsterdb/src/1base/packstop.h new file mode 100644 index 0000000000..a32566f4f9 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1base/packstop.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Macros for packing structures; should work with most compilers. + * See packstart.h for a usage example. + * + * @exception_safe: nothrow + * @thread_safe: yes + */ + +/* This class does NOT include root.h! */ + +#if !defined(_NEWGNUC_) && !defined(__WATCOMC__) && !defined(_NEWMSC_) +# pragma pack() +#endif +#ifdef _NEWMSC_ +# pragma pack(pop) +#endif +#if defined(_NEWMSC_) && !defined(_WIN32_WCE) +# pragma pack(pop) +#endif + diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1base/pickle.h b/plugins/Dbx_kv/src/hamsterdb/src/1base/pickle.h new file mode 100644 index 0000000000..8927e08910 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1base/pickle.h @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Class for pickling/unpickling data to a buffer + * + * @exception_safe: nothrow + * @thread_safe: yes + */ + +#ifndef HAM_PICKLE_H +#define HAM_PICKLE_H + +#include "0root/root.h" + +#include "ham/types.h" + +// Always verify that a file of level N does not include headers > N! 
namespace hamsterdb {

/*
 * Pickles unsigned integers into a byte buffer. Every output byte carries
 * one nibble (4 bits) of the number; the least significant nibble is stored
 * first.
 */
struct Pickle {
  /* Encodes |n| and stores it in |p|, one nibble per byte with the least
   * significant nibble at p[0]; returns the number of bytes used (1..8).
   *
   * At most 8 nibbles are written, therefore only the lower 32 bits of |n|
   * are stored. |p| must provide room for 8 bytes. */
  static size_t encode_u64(uint8_t *p, uint64_t n) {
    size_t used = 0;

    // do/while: a value of 0 is still encoded as one (zero) nibble
    do {
      p[used++] = (uint8_t)(n & 0xf);
      n >>= 4;
    } while (n != 0 && used < 8);

    return (used);
  }

  /* Decodes and returns a pickled number of |len| bytes; this is the
   * inverse of encode_u64(). A |len| of 0 yields 0 and does not touch |p|. */
  static uint64_t decode_u64(size_t len, uint8_t *p) {
    uint64_t ret = 0;

    // walk from the most significant nibble (p[len - 1]) down to p[0];
    // counting downwards avoids the |len - 1| underflow for len == 0
    for (size_t i = len; i > 0; i--)
      ret = (ret << 4) + p[i - 1];

    return (ret);
  }
};

} // namespace hamsterdb
a/plugins/Dbx_kv/src/hamsterdb/src/1base/scoped_ptr.h b/plugins/Dbx_kv/src/hamsterdb/src/1base/scoped_ptr.h new file mode 100644 index 0000000000..b920059aad --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1base/scoped_ptr.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * A (stupid) smart pointer + * + * @exception_safe: nothrow + * @thread_safe: yes + */ + +#ifndef HAM_SCOPED_PTR_H +#define HAM_SCOPED_PTR_H + +#include "0root/root.h" + +#define BOOST_ALL_NO_LIB // disable MSVC auto-linking +#include <boost/scoped_ptr.hpp> + +// Always verify that a file of level N does not include headers > N! + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +template <typename T> +struct ScopedPtr : public boost::scoped_ptr<T> +{ + ScopedPtr() + : boost::scoped_ptr<T>() { + } + + ScopedPtr(T *t) + : boost::scoped_ptr<T>(t) { + } +}; + +} // namespace hamsterdb + +#endif /* HAM_SCOPED_PTR_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1base/spinlock.h b/plugins/Dbx_kv/src/hamsterdb/src/1base/spinlock.h new file mode 100644 index 0000000000..e9d917212c --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1base/spinlock.h @@ -0,0 +1,127 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * A fast spinlock, taken from the boost documentation + * http://www.boost.org/doc/libs/1_57_0/doc/html/atomic/usage_examples.html + * + * @exception_safe: nothrow + * @thread_safe: yes + */ + +#ifndef HAM_SPINLOCK_H +#define HAM_SPINLOCK_H + +#include "0root/root.h" + +#include <stdio.h> +#ifndef HAM_OS_WIN32 +# include <sched.h> +# include <unistd.h> +#endif +#include <boost/atomic.hpp> + +// Always verify that a file of level N does not include headers > N! 
+#include "1base/error.h" +#include "1base/mutex.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +#ifdef HAM_ENABLE_HELGRIND +typedef Mutex Spinlock; +#else + +class Spinlock { + typedef enum { + kLocked, + kUnlocked, + kSpinThreshold = 10 + } LockState; + + public: + Spinlock() + : m_state(kUnlocked) { + } + + // Need user-defined copy constructor because boost::atomic<> is not + // copyable + Spinlock(const Spinlock &other) + : m_state(other.m_state.load()) { + } + + void lock() { + int k = 0; + while (m_state.exchange(kLocked, boost::memory_order_acquire) == kLocked) + spin(++k); + } + + void unlock() { + m_state.store(kUnlocked, boost::memory_order_release); + } + + bool try_lock() { + return (m_state.exchange(kLocked, boost::memory_order_acquire) + != kLocked); + } + + static void spin(int loop) { + if (loop < kSpinThreshold) { +#ifdef HAM_OS_WIN32 + ::Sleep(0); +#elif HAVE_SCHED_YIELD + ::sched_yield(); +#else + ham_assert(!"Please implement me"); +#endif + } + else { +#ifdef HAM_OS_WIN32 + ::Sleep(25); +#elif HAVE_USLEEP + ::usleep(25); +#else + ham_assert(!"Please implement me"); +#endif + } + } + + private: + boost::atomic<LockState> m_state; +}; +#endif // HAM_ENABLE_HELGRIND + +class ScopedSpinlock { + public: + ScopedSpinlock(Spinlock &lock) + : m_spinlock(lock) { + m_spinlock.lock(); + } + + ~ScopedSpinlock() { + m_spinlock.unlock(); + } + + private: + Spinlock &m_spinlock; +}; + +} // namespace hamsterdb + +#endif /* HAM_SPINLOCK_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1base/util.cc b/plugins/Dbx_kv/src/hamsterdb/src/1base/util.cc new file mode 100644 index 0000000000..828fb3ec9d --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1base/util.cc @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
namespace hamsterdb {

//
// vsnprintf replacement/wrapper
//
// Formats |format| with the arguments in |ap| into |str|, which has room
// for |size| bytes, and returns the result of the underlying C library
// function.
//
// Platforms which define neither HAM_OS_POSIX nor HAM_OS_WIN32 fall back to
// vsprintf, which ignores |size|; the caller then has to guarantee that
// |str| is large enough.
//
int
util_vsnprintf(char *str, size_t size, const char *format, va_list ap)
{
#if defined(HAM_OS_POSIX)
  return vsnprintf(str, size, format, ap);
#elif defined(HAM_OS_WIN32)
  int ret = _vsnprintf(str, size, format, ap);
  // _vsnprintf does not append a terminator if the output fills the whole
  // buffer or had to be truncated; terminate by hand in these cases
  if (size > 0 && (ret < 0 || (size_t)ret >= size))
    str[size - 1] = '\0';
  return (ret);
#else
  (void)size;
  return (vsprintf(str, format, ap));
#endif
}

} // namespace hamsterdb
utility classes and functions + * + * @exception_safe: nothrow + * @thread_safe: yes + */ + +#ifndef HAM_UTIL_H +#define HAM_UTIL_H + +#include "0root/root.h" + +#include <stdarg.h> +#include <stdio.h> +#include <string.h> + +// Always verify that a file of level N does not include headers > N! + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +// +// vsnprintf replacement/wrapper +// +// uses vsprintf on platforms which do not define vsnprintf +// +extern int +util_vsnprintf(char *str, size_t size, const char *format, va_list ap); + +// +// snprintf replacement/wrapper +// +// uses sprintf on platforms which do not define snprintf +// +#ifndef HAM_OS_POSIX +# define util_snprintf _snprintf +#else +# define util_snprintf snprintf +#endif + +} // namespace hamsterdb + +#endif // HAM_UTIL_H diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1errorinducer/errorinducer.cc b/plugins/Dbx_kv/src/hamsterdb/src/1errorinducer/errorinducer.cc new file mode 100644 index 0000000000..9f343c5ed6 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1errorinducer/errorinducer.cc @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! 
+#include "1errorinducer/errorinducer.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +ErrorInducer ErrorInducer::ms_instance; +bool ErrorInducer::ms_is_active = false; + +} // namespace hamsterdb diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1errorinducer/errorinducer.h b/plugins/Dbx_kv/src/hamsterdb/src/1errorinducer/errorinducer.h new file mode 100644 index 0000000000..4a7b2107af --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1errorinducer/errorinducer.h @@ -0,0 +1,116 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Facility to simulate errors + * + * @exception_safe: nothrow + * @thread_safe: no + */ + +#ifndef HAM_ERRORINDUCER_H +#define HAM_ERRORINDUCER_H + +#include "0root/root.h" + +#include <string.h> + +#include "ham/hamsterdb.h" + +// Always verify that a file of level N does not include headers > N! 
+#include "1base/error.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +// a macro to invoke errors +#define HAM_INDUCE_ERROR(id) \ + while (ErrorInducer::is_active()) { \ + ham_status_t st = ErrorInducer::get_instance()->induce(id); \ + if (st) \ + throw Exception(st); \ + break; \ + } + +namespace hamsterdb { + +class ErrorInducer { + struct State { + State() + : loops(0), error(HAM_INTERNAL_ERROR) { + } + + int loops; + ham_status_t error; + }; + + public: + enum Action { + // simulates a failure in Changeset::flush + kChangesetFlush, + + // simulates a hang in hamserver-connect + kServerConnect, + + kMaxActions + }; + + // Activates or deactivates the error inducer + static void activate(bool active) { + ms_is_active = active; + } + + // Returns true if the error inducer is active + static bool is_active() { + return (ms_is_active); + } + + // Returns the singleton instance + static ErrorInducer *get_instance() { + return (&ms_instance); + } + + ErrorInducer() { + memset(&m_state[0], 0, sizeof(m_state)); + } + + void add(Action action, int loops, + ham_status_t error = HAM_INTERNAL_ERROR) { + m_state[action].loops = loops; + m_state[action].error = error; + } + + ham_status_t induce(Action action) { + ham_assert(m_state[action].loops >= 0); + if (m_state[action].loops > 0 && --m_state[action].loops == 0) + return (m_state[action].error); + return (0); + } + + private: + State m_state[kMaxActions]; + + // The singleton instance + static ErrorInducer ms_instance; + + // Is the ErrorInducer active? + static bool ms_is_active; +}; + +} // namespace hamsterdb + +#endif /* HAM_ERRORINDUCER_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1globals/globals.cc b/plugins/Dbx_kv/src/hamsterdb/src/1globals/globals.cc new file mode 100644 index 0000000000..9f5d184c55 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1globals/globals.cc @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "1globals/globals.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +uint64_t Globals::ms_extended_keys; + +uint64_t Globals::ms_extended_duptables; + +uint32_t Globals::ms_extended_threshold; + +uint32_t Globals::ms_duplicate_threshold; + +int Globals::ms_linear_threshold; + +int Globals::ms_error_level; + +const char *Globals::ms_error_file; + +int Globals::ms_error_line; + +const char *Globals::ms_error_expr; + +const char *Globals::ms_error_function; + +// the default error handler +void HAM_CALLCONV default_errhandler(int level, const char *message); + +ham_errhandler_fun Globals::ms_error_handler = default_errhandler; + +uint64_t Globals::ms_bytes_before_compression; + +uint64_t Globals::ms_bytes_after_compression; + +bool Globals::ms_is_simd_enabled = true; + +} // namespace hamsterdb + diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1globals/globals.h b/plugins/Dbx_kv/src/hamsterdb/src/1globals/globals.h new file mode 100644 index 0000000000..efe3449e93 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1globals/globals.h @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Global variables; used for tests and metrics + * + * @exception_safe: nothrow + * @thread_safe: no + */ + +#ifndef HAM_GLOBALS_H +#define HAM_GLOBALS_H + +#include "0root/root.h" + +#include "ham/types.h" + +// Always verify that a file of level N does not include headers > N! +#include "1base/error.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +struct Globals { + // for counting extended keys + static uint64_t ms_extended_keys; + + // for counting extended duplicate tables + static uint64_t ms_extended_duptables; + + // Move every key > threshold to a blob. For testing purposes. + // TODO currently gets assigned at runtime + static uint32_t ms_extended_threshold; + + // Create duplicate table if amount of duplicates > threshold. For testing + // purposes. 
+ // TODO currently gets assigned at runtime + static uint32_t ms_duplicate_threshold; + + // linear search threshold for the PAX layout + static int ms_linear_threshold; + + // used in error.h/error.cc + static int ms_error_level; + + // used in error.h/error.cc + static const char *ms_error_file; + + // used in error.h/error.cc + static int ms_error_line; + + // used in error.h/error.cc + static const char *ms_error_expr; + + // used in error.h/error.cc + static const char *ms_error_function; + + // used in error.h/error.cc + static ham_errhandler_fun ms_error_handler; + + // PRO: Tracking key bytes before compression + static uint64_t ms_bytes_before_compression; + + // PRO: Tracking key bytes after compression + static uint64_t ms_bytes_after_compression; + + // PRO: enable/disable SIMD + static bool ms_is_simd_enabled; +}; + +} // namespace hamsterdb + +#endif /* HAM_GLOBALS_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1mem/mem.cc b/plugins/Dbx_kv/src/hamsterdb/src/1mem/mem.cc new file mode 100644 index 0000000000..58a00b87c3 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1mem/mem.cc @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "0root/root.h" + +#ifdef HAM_USE_TCMALLOC +# include <google/tcmalloc.h> +# include <google/malloc_extension.h> +#endif +#include <stdlib.h> + +#include "ham/hamsterdb_int.h" + +// Always verify that a file of level N does not include headers > N! +#include "1os/file.h" +#include "1mem/mem.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +uint64_t Memory::ms_peak_memory; +uint64_t Memory::ms_total_allocations; +uint64_t Memory::ms_current_allocations; + +void +Memory::get_global_metrics(ham_env_metrics_t *metrics) +{ +#ifdef HAM_USE_TCMALLOC + size_t value = 0; + MallocExtension::instance()->GetNumericProperty( + "generic.current_allocated_bytes", &value); + metrics->mem_current_usage = value; + if (ms_peak_memory < value) + ms_peak_memory = metrics->mem_peak_usage = value; + MallocExtension::instance()->GetNumericProperty( + "generic.heap_size", &value); + metrics->mem_heap_size = value; +#endif + + metrics->mem_total_allocations = ms_total_allocations; + metrics->mem_current_allocations = ms_current_allocations; +} + +} // namespace hamsterdb diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1mem/mem.h b/plugins/Dbx_kv/src/hamsterdb/src/1mem/mem.h new file mode 100644 index 0000000000..13f79b618c --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1mem/mem.h @@ -0,0 +1,151 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Memory handling + * + * @exception_safe: nothrow + * @thread_safe: no (b/c of metrics) + */ + +#ifndef HAM_MEM_H +#define HAM_MEM_H + +#include "0root/root.h" + +#include <new> +#include <stdlib.h> +#ifdef HAM_USE_TCMALLOC +# include <google/tcmalloc.h> +#endif + +#include "ham/hamsterdb.h" + +// Always verify that a file of level N does not include headers > N! +#include "1base/error.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +struct ham_env_metrics_t; + +namespace hamsterdb { + +/* + * The static Memory class provides memory management functions in a common + * c++ namespace. The functions can allocate, reallocate and free memory + * while tracking usage statistics. + * + * If tcmalloc is used then additional metrics will be available. + * + * This class only has static members and methods. It does not have a + * constructor. + */ +class Memory { + public: + // allocates |size| bytes, casted into type |T *|; + // returns null if out of memory. + // usage: + // + // char *p = Memory::allocate<char>(1024); + // + template<typename T> + static T *allocate(size_t size) { + ms_total_allocations++; + ms_current_allocations++; +#ifdef HAM_USE_TCMALLOC + T *t = (T *)::tc_malloc(size); +#else + T *t = (T *)::malloc(size); +#endif + if (!t) + throw Exception(HAM_OUT_OF_MEMORY); + return (t); + } + + // allocates |size| bytes; returns null if out of memory. initializes + // the allocated memory with zeroes. + // usage: + // + // const char *p = Memory::callocate<const char>(50); + // + template<typename T> + static T *callocate(size_t size) { + ms_total_allocations++; + ms_current_allocations++; + +#ifdef HAM_USE_TCMALLOC + T *t = (T *)::tc_calloc(1, size); +#else + T *t = (T *)::calloc(1, size); +#endif + if (!t) + throw Exception(HAM_OUT_OF_MEMORY); + return (t); + } + + // re-allocates |ptr| for |size| bytes; returns null if out of memory. + // |ptr| can be null on first use. 
+ // usage: + // + // p = Memory::reallocate<char>(p, 100); + // + template<typename T> + static T *reallocate(T *ptr, size_t size) { + if (ptr == 0) { + ms_total_allocations++; + ms_current_allocations++; + } +#ifdef HAM_USE_TCMALLOC + T *t = (T *)::tc_realloc(ptr, size); +#else + T *t = (T *)::realloc(ptr, size); +#endif + if (!t) + throw Exception(HAM_OUT_OF_MEMORY); + return (t); + } + + // releases a memory block; can deal with NULL pointers. + static void release(void *ptr) { + if (ptr) { + ms_current_allocations--; +#ifdef HAM_USE_TCMALLOC + ::tc_free(ptr); +#else + ::free(ptr); +#endif + } + } + + // updates and returns the collected metrics + static void get_global_metrics(ham_env_metrics_t *metrics); + + private: + // peak memory usage + static uint64_t ms_peak_memory; + + // total memory allocations + static uint64_t ms_total_allocations; + + // currently active allocations + static uint64_t ms_current_allocations; +}; + +} // namespace hamsterdb + +#endif /* HAM_MEM_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1os/file.h b/plugins/Dbx_kv/src/hamsterdb/src/1os/file.h new file mode 100644 index 0000000000..df9049c6de --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1os/file.h @@ -0,0 +1,154 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * A simple wrapper around a file handle. Throws exceptions in + * case of errors. Moves the file handle when copied. 
+ * + * @exception_safe: strong + * @thread_safe: unknown + */ + +#ifndef HAM_FILE_H +#define HAM_FILE_H + +#include "0root/root.h" + +#include <stdio.h> +#include <limits.h> + +#include "ham/types.h" + +// Always verify that a file of level N does not include headers > N! +#include "1os/os.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class File +{ + public: + enum { +#ifdef HAM_OS_POSIX + kSeekSet = SEEK_SET, + kSeekEnd = SEEK_END, + kSeekCur = SEEK_CUR, + kMaxPath = PATH_MAX +#else + kSeekSet = FILE_BEGIN, + kSeekEnd = FILE_END, + kSeekCur = FILE_CURRENT, + kMaxPath = MAX_PATH +#endif + }; + + // Constructor: creates an empty File handle + File() + : m_fd(HAM_INVALID_FD), m_mmaph(HAM_INVALID_FD), m_posix_advice(0) { + } + + // Copy constructor: moves ownership of the file handle + File(File &other) + : m_fd(other.m_fd), m_mmaph(other.m_mmaph), + m_posix_advice(other.m_posix_advice) { + other.m_fd = HAM_INVALID_FD; + other.m_mmaph = HAM_INVALID_FD; + } + + // Destructor: closes the file + ~File() { + close(); + } + + // Assignment operator: moves ownership of the file handle + File &operator=(File &other) { + m_fd = other.m_fd; + other.m_fd = HAM_INVALID_FD; + return (*this); + } + + // Creates a new file + void create(const char *filename, uint32_t mode); + + // Opens an existing file + void open(const char *filename, bool read_only); + + // Returns true if the file is open + bool is_open() const { + return (m_fd != HAM_INVALID_FD); + } + + // Flushes a file + void flush(); + + // Sets the parameter for posix_fadvise() + void set_posix_advice(int parameter); + + // Maps a file in memory + // + // mmap is called with MAP_PRIVATE - the allocated buffer + // is just a copy of the file; writing to the buffer will not alter + // the file itself. 
+ void mmap(uint64_t position, size_t size, bool readonly, + uint8_t **buffer); + + // Unmaps a buffer + void munmap(void *buffer, size_t size); + + // Positional read from a file + void pread(uint64_t addr, void *buffer, size_t len); + + // Positional write to a file + void pwrite(uint64_t addr, const void *buffer, size_t len); + + // Write data to a file; uses the current file position + void write(const void *buffer, size_t len); + + // Get the page allocation granularity of the operating system + static size_t get_granularity(); + + // Seek position in a file + void seek(uint64_t offset, int whence); + + // Tell the position in a file + uint64_t tell(); + + // Returns the size of the file + uint64_t get_file_size(); + + // Truncate/resize the file + void truncate(uint64_t newsize); + + // Closes the file descriptor + void close(); + + private: + // The file handle + ham_fd_t m_fd; + + // The mmap handle - required for Win32 + ham_fd_t m_mmaph; + + // Parameter for posix_fadvise() + int m_posix_advice; +}; + +} // namespace hamsterdb + +#endif /* HAM_FILE_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1os/os.cc b/plugins/Dbx_kv/src/hamsterdb/src/1os/os.cc new file mode 100644 index 0000000000..8f8c0c991c --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1os/os.cc @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
namespace hamsterdb {

// Returns the SIMD lane width. SIMD is only supported in hamsterdb pro,
// therefore this implementation always reports 0.
int
os_get_simd_lane_width()
{
  const int lane_width = 0;
  return (lane_width);
}

} // namespace hamsterdb
+ +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +/* + * typedefs for posix + */ +#ifdef HAM_OS_POSIX +typedef int ham_fd_t; +typedef int ham_socket_t; +# define HAM_INVALID_FD (-1) +#endif + +/* + * typedefs for Windows 32- and 64-bit + */ +#ifdef HAM_OS_WIN32 +# ifdef CYGWIN +typedef int ham_fd_t; +typedef int ham_socket_t; +# else +typedef HANDLE ham_fd_t; +typedef SOCKET ham_socket_t; +# endif +# define HAM_INVALID_FD (0) +#endif + +// Returns the number of 32bit integers that the CPU can process in +// parallel (the SIMD lane width) +extern int +os_get_simd_lane_width(); + +} // namespace hamsterdb + +#endif /* HAM_OS_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1os/os_posix.cc b/plugins/Dbx_kv/src/hamsterdb/src/1os/os_posix.cc new file mode 100644 index 0000000000..135899e7ea --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1os/os_posix.cc @@ -0,0 +1,474 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#define _GNU_SOURCE 1 // for O_LARGEFILE +#define _FILE_OFFSET_BITS 64 + +#include "0root/root.h" + +#include <stdio.h> +#include <errno.h> +#include <string.h> +#if HAVE_MMAP +# include <sys/mman.h> +#endif +#if HAVE_WRITEV +# include <sys/uio.h> +#endif +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/file.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <netdb.h> +#include <fcntl.h> +#include <unistd.h> + +// Always verify that a file of level N does not include headers > N! +#include "1base/error.h" +#include "1os/file.h" +#include "1os/socket.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +#if 0 +# define os_log(x) ham_log(x) +#else +# define os_log(x) +#endif + +static void +lock_exclusive(int fd, bool lock) +{ +#ifdef HAM_SOLARIS + // SunOS 5.9 doesn't have LOCK_* unless i include /usr/ucbinclude; but then, + // mmap behaves strangely (the first write-access to the mmapped buffer + // leads to a segmentation fault). + // + // Tell me if this troubles you/if you have suggestions for fixes. +#else + int flags; + + if (lock) + flags = LOCK_EX | LOCK_NB; + else + flags = LOCK_UN; + + if (0 != flock(fd, flags)) { + ham_log(("flock failed with status %u (%s)", errno, strerror(errno))); + // it seems that linux does not only return EWOULDBLOCK, as stated + // in the documentation (flock(2)), but also other errors... + if (errno && lock) + throw Exception(HAM_WOULD_BLOCK); + throw Exception(HAM_IO_ERROR); + } +#endif +} + +static void +enable_largefile(int fd) +{ + // not available on cygwin... 
+#ifdef HAVE_O_LARGEFILE + int oflag = fcntl(fd, F_GETFL, 0); + fcntl(fd, F_SETFL, oflag | O_LARGEFILE); +#endif +} + +static void +os_read(ham_fd_t fd, uint8_t *buffer, size_t len) +{ + os_log(("os_read: fd=%d, size=%lld", fd, len)); + + int r; + size_t total = 0; + + while (total < len) { + r = read(fd, &buffer[total], len - total); + if (r < 0) { + ham_log(("os_read failed with status %u (%s)", errno, strerror(errno))); + throw Exception(HAM_IO_ERROR); + } + if (r == 0) + break; + total += r; + } + + if (total != len) { + ham_log(("os_read() failed with short read (%s)", strerror(errno))); + throw Exception(HAM_IO_ERROR); + } +} + +static void +os_write(ham_fd_t fd, const void *buffer, size_t len) +{ + int w; + size_t total = 0; + const char *p = (const char *)buffer; + + while (total < len) { + w = ::write(fd, p + total, len - total); + if (w < 0) { + ham_log(("os_write failed with status %u (%s)", errno, + strerror(errno))); + throw Exception(HAM_IO_ERROR); + } + if (w == 0) + break; + total += w; + } + + if (total != len) { + ham_log(("os_write() failed with short read (%s)", strerror(errno))); + throw Exception(HAM_IO_ERROR); + } +} + +size_t +File::get_granularity() +{ + return ((size_t)sysconf(_SC_PAGE_SIZE)); +} + +void +File::set_posix_advice(int advice) +{ + m_posix_advice = advice; + ham_assert(m_fd != HAM_INVALID_FD); + +#if HAVE_POSIX_FADVISE + if (m_posix_advice == HAM_POSIX_FADVICE_RANDOM) { + int r = ::posix_fadvise(m_fd, 0, 0, POSIX_FADV_RANDOM); + if (r != 0) { + ham_log(("posix_fadvise failed with status %d (%s)", + errno, strerror(errno))); + throw Exception(HAM_IO_ERROR); + } + } +#endif +} + +void +File::mmap(uint64_t position, size_t size, bool readonly, uint8_t **buffer) +{ + os_log(("File::mmap: fd=%d, position=%lld, size=%lld", m_fd, position, size)); + + int prot = PROT_READ; + if (!readonly) + prot |= PROT_WRITE; + +#if HAVE_MMAP + *buffer = (uint8_t *)::mmap(0, size, prot, MAP_PRIVATE, m_fd, position); + if (*buffer == (void *)-1) { + 
*buffer = 0; + ham_log(("mmap failed with status %d (%s)", errno, strerror(errno))); + throw Exception(HAM_IO_ERROR); + } +#else + throw Exception(HAM_NOT_IMPLEMENTED); +#endif + +#if HAVE_MADVISE + if (m_posix_advice == HAM_POSIX_FADVICE_RANDOM) { + int r = ::madvise(*buffer, size, MADV_RANDOM); + if (r != 0) { + ham_log(("madvise failed with status %d (%s)", errno, strerror(errno))); + throw Exception(HAM_IO_ERROR); + } + } +#endif +} + +void +File::munmap(void *buffer, size_t size) +{ + os_log(("File::munmap: size=%lld", size)); + +#if HAVE_MUNMAP + int r = ::munmap(buffer, size); + if (r) { + ham_log(("munmap failed with status %d (%s)", errno, strerror(errno))); + throw Exception(HAM_IO_ERROR); + } +#else + throw Exception(HAM_NOT_IMPLEMENTED); +#endif +} + +void +File::pread(uint64_t addr, void *buffer, size_t len) +{ +#if HAVE_PREAD + os_log(("File::pread: fd=%d, address=%lld, size=%lld", m_fd, addr, + len)); + + int r; + size_t total = 0; + + while (total < len) { + r = ::pread(m_fd, (uint8_t *)buffer + total, len - total, + addr + total); + if (r < 0) { + ham_log(("File::pread failed with status %u (%s)", errno, + strerror(errno))); + throw Exception(HAM_IO_ERROR); + } + if (r == 0) + break; + total += r; + } + + if (total != len) { + ham_log(("File::pread() failed with short read (%s)", strerror(errno))); + throw Exception(HAM_IO_ERROR); + } +#else + File::seek(addr, kSeekSet); + os_read(m_fd, (uint8_t *)buffer, len); +#endif +} + +void +File::pwrite(uint64_t addr, const void *buffer, size_t len) +{ + os_log(("File::pwrite: fd=%d, address=%lld, size=%lld", m_fd, addr, len)); + +#if HAVE_PWRITE + ssize_t s; + size_t total = 0; + + while (total < len) { + s = ::pwrite(m_fd, buffer, len, addr + total); + if (s < 0) { + ham_log(("pwrite() failed with status %u (%s)", errno, strerror(errno))); + throw Exception(HAM_IO_ERROR); + } + if (s == 0) + break; + total += s; + } + + if (total != len) { + ham_log(("pwrite() failed with short read (%s)", 
strerror(errno))); + throw Exception(HAM_IO_ERROR); + } +#else + seek(addr, kSeekSet); + write(buffer, len); +#endif +} + +void +File::write(const void *buffer, size_t len) +{ + os_log(("File::write: fd=%d, size=%lld", m_fd, len)); + os_write(m_fd, buffer, len); +} + +void +File::seek(uint64_t offset, int whence) +{ + os_log(("File::seek: fd=%d, offset=%lld, whence=%d", m_fd, offset, whence)); + if (lseek(m_fd, offset, whence) < 0) + throw Exception(HAM_IO_ERROR); +} + +uint64_t +File::tell() +{ + uint64_t offset = lseek(m_fd, 0, SEEK_CUR); + os_log(("File::tell: fd=%d, offset=%lld", m_fd, offset)); + if (offset == (uint64_t) - 1) + throw Exception(HAM_IO_ERROR); + return (offset); +} + +uint64_t +File::get_file_size() +{ + seek(0, kSeekEnd); + uint64_t size = tell(); + os_log(("File::get_file_size: fd=%d, size=%lld", m_fd, size)); + return (size); +} + +void +File::truncate(uint64_t newsize) +{ + os_log(("File::truncate: fd=%d, size=%lld", m_fd, newsize)); + if (ftruncate(m_fd, newsize)) + throw Exception(HAM_IO_ERROR); +} + +void +File::create(const char *filename, uint32_t mode) +{ + int osflags = O_CREAT | O_RDWR | O_TRUNC; +#if HAVE_O_NOATIME + osflags |= O_NOATIME; +#endif + + ham_fd_t fd = ::open(filename, osflags, mode ? mode : 0644); + if (fd < 0) { + ham_log(("creating file %s failed with status %u (%s)", filename, + errno, strerror(errno))); + throw Exception(HAM_IO_ERROR); + } + + /* lock the file - this is default behaviour since 1.1.0 */ + lock_exclusive(fd, true); + + /* enable O_LARGEFILE support */ + enable_largefile(fd); + + m_fd = fd; +} + +void +File::flush() +{ + os_log(("File::flush: fd=%d", m_fd)); + /* unlike fsync(), fdatasync() does not flush the metadata unless + * it's really required. it's therefore a lot faster. 
*/ +#if HAVE_FDATASYNC && !__APPLE__ + if (fdatasync(m_fd) == -1) { +#else + if (fsync(m_fd) == -1) { +#endif + ham_log(("fdatasync failed with status %u (%s)", + errno, strerror(errno))); + throw Exception(HAM_IO_ERROR); + } +} + +void +File::open(const char *filename, bool read_only) +{ + int osflags = 0; + + if (read_only) + osflags |= O_RDONLY; + else + osflags |= O_RDWR; +#if HAVE_O_NOATIME + osflags |= O_NOATIME; +#endif + + ham_fd_t fd = ::open(filename, osflags); + if (fd < 0) { + ham_log(("opening file %s failed with status %u (%s)", filename, + errno, strerror(errno))); + throw Exception(errno == ENOENT ? HAM_FILE_NOT_FOUND : HAM_IO_ERROR); + } + + /* lock the file - this is default behaviour since 1.1.0 */ + lock_exclusive(fd, true); + + /* enable O_LARGEFILE support */ + enable_largefile(fd); + + m_fd = fd; +} + +void +File::close() +{ + if (m_fd != HAM_INVALID_FD) { + // on posix, we most likely don't want to close descriptors 0 and 1 + ham_assert(m_fd != 0 && m_fd != 1); + + // unlock the file - this is default behaviour since 1.1.0 + lock_exclusive(m_fd, false); + + // now close the descriptor + if (::close(m_fd) == -1) + throw Exception(HAM_IO_ERROR); + + m_fd = HAM_INVALID_FD; + } +} + +void +Socket::connect(const char *hostname, uint16_t port, uint32_t timeout_sec) +{ + ham_socket_t s = ::socket(AF_INET, SOCK_STREAM, 0); + if (s < 0) { + ham_log(("failed creating socket: %s", strerror(errno))); + throw Exception(HAM_IO_ERROR); + } + + struct hostent *server = ::gethostbyname(hostname); + if (!server) { + ham_log(("unable to resolve hostname %s: %s", hostname, + hstrerror(h_errno))); + ::close(s); + throw Exception(HAM_NETWORK_ERROR); + } + + struct sockaddr_in addr; + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + memcpy(&addr.sin_addr.s_addr, server->h_addr, server->h_length); + addr.sin_port = htons(port); + if (::connect(s, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + ham_log(("unable to connect to %s:%d: %s", hostname, 
(int)port, + strerror(errno))); + ::close(s); + throw Exception(HAM_NETWORK_ERROR); + } + + if (timeout_sec) { + struct timeval tv; + tv.tv_sec = timeout_sec; + tv.tv_usec = 0; + if (::setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, (char *)&tv, sizeof(tv)) < 0) { + ham_log(("unable to set socket timeout to %d sec: %s", timeout_sec, + strerror(errno))); + // fall through, this is not critical + } + } + + m_socket = s; +} + +void +Socket::send(const uint8_t *data, size_t len) +{ + os_write(m_socket, data, len); +} + +void +Socket::recv(uint8_t *data, size_t len) +{ + os_read(m_socket, data, len); +} + +void +Socket::close() +{ + if (m_socket != HAM_INVALID_FD) { + if (::close(m_socket) == -1) + throw Exception(HAM_IO_ERROR); + m_socket = HAM_INVALID_FD; + } +} + +} // namespace hamsterdb diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1os/os_win32.cc b/plugins/Dbx_kv/src/hamsterdb/src/1os/os_win32.cc new file mode 100644 index 0000000000..ac51a4a7b7 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1os/os_win32.cc @@ -0,0 +1,542 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <winsock2.h> +#include <windows.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <assert.h> + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! 
+#include "1base/error.h" +#include "1os/file.h" +#include "1os/socket.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +static const char * +DisplayError(char* buf, uint32_t buflen, DWORD errorcode) +{ + size_t len; + + buf[0] = 0; + FormatMessageA(/* FORMAT_MESSAGE_ALLOCATE_BUFFER | */ + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, errorcode, + MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + (LPSTR)buf, buflen, NULL); + buf[buflen - 1] = 0; + + /* strip trailing whitespace\newlines */ + for (len = strlen(buf); len-- > 0; ) { + if (!isspace(buf[len])) + break; + buf[len] = 0; + } + + return (buf); +} + +/* + * MS says: + * + * Security Alert + * + * Using the MultiByteToWideChar function incorrectly can compromise the + * security of your application. Calling this function can easily cause a + * buffer overrun because the size of the input buffer indicated by + * lpMultiByteStr equals the number of bytes in the string, while the size of + * the output buffer indicated by lpWideCharStr equals the number of WCHAR + * values. + * + * To avoid a buffer overrun, your application must specify a buffer size + * appropriate for the data type the buffer receives. For more information, see + * Security Considerations: International Features. 
+ */ +static void +utf8_string(const char *filename, WCHAR *wfilename, int wlen) +{ + MultiByteToWideChar(CP_ACP, 0, filename, -1, wfilename, wlen); +} + +static int +calc_wlen4str(const char *str) +{ + // Since we call MultiByteToWideChar with an input length of -1, the + // output will include the wchar NUL sentinel as well, so count it + return (int)(strlen(str) + 1); +} + +size_t +File::get_granularity() +{ + SYSTEM_INFO info; + GetSystemInfo(&info); + return ((size_t)info.dwAllocationGranularity); +} + +void +File::set_posix_advice(int advice) +{ + // Only available for posix platforms +} + +void +File::mmap(uint64_t position, size_t size, bool readonly, uint8_t **buffer) +{ + ham_status_t st; + DWORD protect = (readonly ? PAGE_READONLY : PAGE_WRITECOPY); + DWORD access = FILE_MAP_COPY; + LARGE_INTEGER i; + i.QuadPart = position; + + m_mmaph = CreateFileMapping(m_fd, 0, protect, 0, 0, 0); + if (!m_mmaph) { + char buf[256]; + *buffer = 0; + st = (ham_status_t)GetLastError(); + ham_log(("CreateFileMapping failed with OS status %u (%s)", + st, DisplayError(buf, sizeof(buf), st))); + throw Exception(HAM_IO_ERROR); + } + + *buffer = (uint8_t *)MapViewOfFile(m_mmaph, access, i.HighPart, i.LowPart, + (SIZE_T)size); + if (!*buffer) { + char buf[256]; + st = (ham_status_t)GetLastError(); + /* make sure to release the mapping */ + (void)CloseHandle(m_mmaph); + m_mmaph = HAM_INVALID_FD; + ham_log(("MapViewOfFile failed with OS status %u (%s)", + st, DisplayError(buf, sizeof(buf), st))); + if (st == ERROR_NOT_ENOUGH_QUOTA) // not enough resources - fallback to r/w + throw Exception(HAM_LIMITS_REACHED); + throw Exception(HAM_IO_ERROR); + } +} + +void +File::munmap(void *buffer, size_t size) +{ + ham_status_t st; + + if (!UnmapViewOfFile(buffer)) { + char buf[256]; + st = (ham_status_t)GetLastError(); + ham_log(("UnMapViewOfFile failed with OS status %u (%s)", st, + DisplayError(buf, sizeof(buf), st))); + throw Exception(HAM_IO_ERROR); + } + + if (m_mmaph != HAM_INVALID_FD) 
{ + if (!CloseHandle(m_mmaph)) { + char buf[256]; + st = (ham_status_t)GetLastError(); + ham_log(("CloseHandle failed with OS status %u (%s)", st, + DisplayError(buf, sizeof(buf), st))); + throw Exception(HAM_IO_ERROR); + } + } + + m_mmaph = HAM_INVALID_FD; +} + +void +File::pread(uint64_t addr, void *buffer, size_t len) +{ + ham_status_t st; + OVERLAPPED ov = { 0 }; + ov.Offset = (DWORD)addr; + ov.OffsetHigh = addr >> 32; + DWORD read; + if (!::ReadFile(m_fd, buffer, (DWORD)len, &read, &ov)) { + if (GetLastError() != ERROR_IO_PENDING) { + char buf[256]; + st = (ham_status_t)GetLastError(); + ham_log(("ReadFile failed with OS status %u (%s)", + st, DisplayError(buf, sizeof(buf), st))); + throw Exception(HAM_IO_ERROR); + } + if (!::GetOverlappedResult(m_fd, &ov, &read, TRUE)) { + char buf[256]; + st = (ham_status_t)GetLastError(); + ham_log(("GetOverlappedResult failed with OS status %u (%s)", + st, DisplayError(buf, sizeof(buf), st))); + throw Exception(HAM_IO_ERROR); + } + } + + if (read != len) + throw Exception(HAM_IO_ERROR); +} + +void +File::pwrite(uint64_t addr, const void *buffer, size_t len) +{ + ham_status_t st; + OVERLAPPED ov = { 0 }; + ov.Offset = (DWORD)addr; + ov.OffsetHigh = addr >> 32; + DWORD written; + if (!::WriteFile(m_fd, buffer, (DWORD)len, &written, &ov)) { + if (GetLastError() != ERROR_IO_PENDING) { + char buf[256]; + st = (ham_status_t)GetLastError(); + ham_log(("WriteFile failed with OS status %u (%s)", + st, DisplayError(buf, sizeof(buf), st))); + throw Exception(HAM_IO_ERROR); + } + if (!::GetOverlappedResult(m_fd, &ov, &written, TRUE)) { + char buf[256]; + st = (ham_status_t)GetLastError(); + ham_log(("GetOverlappedResult failed with OS status %u (%s)", + st, DisplayError(buf, sizeof(buf), st))); + throw Exception(HAM_IO_ERROR); + } + } + + if (written != len) + throw Exception(HAM_IO_ERROR); +} + +void +File::write(const void *buffer, size_t len) +{ + ham_status_t st; + DWORD written = 0; + + if (!WriteFile(m_fd, buffer, (DWORD)len, 
&written, 0)) { + char buf[256]; + st = (ham_status_t)GetLastError(); + ham_log(("WriteFile failed with OS status %u (%s)", st, + DisplayError(buf, sizeof(buf), st))); + throw Exception(HAM_IO_ERROR); + } + + if (written != len) + throw Exception(HAM_IO_ERROR); +} + +#ifndef INVALID_SET_FILE_POINTER +# define INVALID_SET_FILE_POINTER ((DWORD)-1) +#endif + +void +File::seek(uint64_t offset, int whence) +{ + DWORD st; + LARGE_INTEGER i; + i.QuadPart = offset; + + i.LowPart = ::SetFilePointer(m_fd, i.LowPart, &i.HighPart, whence); + if (i.LowPart == INVALID_SET_FILE_POINTER && + (st = GetLastError())!=NO_ERROR) { + char buf[256]; + ham_log(("SetFilePointer failed with OS status %u (%s)", st, + DisplayError(buf, sizeof(buf), st))); + throw Exception(HAM_IO_ERROR); + } +} + +uint64_t +File::tell() +{ + DWORD st; + LARGE_INTEGER i; + i.QuadPart = 0; + + i.LowPart = SetFilePointer(m_fd, i.LowPart, &i.HighPart, kSeekCur); + if (i.LowPart == INVALID_SET_FILE_POINTER && + (st = GetLastError()) != NO_ERROR) { + char buf[256]; + ham_log(("SetFilePointer failed with OS status %u (%s)", st, + DisplayError(buf, sizeof(buf), st))); + throw Exception(HAM_IO_ERROR); + } + + return ((size_t)i.QuadPart); +} + +#ifndef INVALID_FILE_SIZE +# define INVALID_FILE_SIZE ((DWORD)-1) +#endif + +uint64_t +File::get_file_size() +{ + ham_status_t st; + LARGE_INTEGER i; + i.QuadPart = 0; + i.LowPart = GetFileSize(m_fd, (LPDWORD)&i.HighPart); + + if (i.LowPart == INVALID_FILE_SIZE && (st = GetLastError()) != NO_ERROR) { + char buf[256]; + ham_log(("GetFileSize failed with OS status %u (%s)", st, + DisplayError(buf, sizeof(buf), st))); + throw Exception(HAM_IO_ERROR); + } + + return ((size_t)i.QuadPart); +} + +void +File::truncate(uint64_t newsize) +{ + File::seek(newsize, kSeekSet); + + if (!SetEndOfFile(m_fd)) { + char buf[256]; + ham_status_t st = (ham_status_t)GetLastError(); + ham_log(("SetEndOfFile failed with OS status %u (%s)", st, + DisplayError(buf, sizeof(buf), st))); + throw 
Exception(HAM_IO_ERROR); + } +} + +void +File::create(const char *filename, uint32_t mode) +{ + ham_status_t st; + DWORD share = 0; /* 1.1.0: default behaviour is exclusive locking */ + DWORD access = GENERIC_READ | GENERIC_WRITE; + ham_fd_t fd; + +#ifdef UNICODE + int fnameWlen = calc_wlen4str(filename); + WCHAR *wfilename = (WCHAR *)malloc(fnameWlen * sizeof(wfilename[0])); + if (!wfilename) + throw Exception(HAM_OUT_OF_MEMORY); + + /* translate ASCII filename to unicode */ + utf8_string(filename, wfilename, fnameWlen); + fd = (ham_fd_t)CreateFileW(wfilename, access, + share, NULL, CREATE_ALWAYS, + FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED, 0); + free(wfilename); +#else + fd = (ham_fd_t)CreateFileA(filename, access, + share, NULL, CREATE_ALWAYS, + FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED, 0); +#endif + + if (fd == INVALID_HANDLE_VALUE) { + char buf[256]; + st = (ham_status_t)GetLastError(); + if (st == ERROR_SHARING_VIOLATION) + throw Exception(HAM_WOULD_BLOCK); + ham_log(("CreateFile(%s, %x, %x, ...) (create) failed with OS status " + "%u (%s)", filename, access, share, st, + DisplayError(buf, sizeof(buf), st))); + throw Exception(HAM_IO_ERROR); + } + + m_fd = fd; +} + +void +File::flush() +{ + ham_status_t st; + + if (!FlushFileBuffers(m_fd)) { + char buf[256]; + st = (ham_status_t)GetLastError(); + ham_log(("FlushFileBuffers failed with OS status %u (%s)", + st, DisplayError(buf, sizeof(buf), st))); + throw Exception(HAM_IO_ERROR); + } +} + +void +File::open(const char *filename, bool read_only) +{ + ham_status_t st; + DWORD share = 0; /* 1.1.0: default behaviour is exclusive locking */ + DWORD access = read_only + ? 
GENERIC_READ + : (GENERIC_READ | GENERIC_WRITE); + DWORD dispo = OPEN_EXISTING; + DWORD osflags = 0; + ham_fd_t fd; + +#ifdef UNICODE + { + int fnameWlen = calc_wlen4str(filename); + WCHAR *wfilename = (WCHAR *)malloc(fnameWlen * sizeof(wfilename[0])); + if (!wfilename) + throw Exception(HAM_OUT_OF_MEMORY); + + /* translate ASCII filename to unicode */ + utf8_string(filename, wfilename, fnameWlen); + fd = (ham_fd_t)CreateFileW(wfilename, access, share, NULL, + dispo, osflags, 0); + free(wfilename); + } +#else + fd = (ham_fd_t)CreateFileA(filename, access, share, NULL, + dispo, osflags, 0); +#endif + + if (fd == INVALID_HANDLE_VALUE) { + char buf[256]; + fd = HAM_INVALID_FD; + st = (ham_status_t)GetLastError(); + ham_log(("CreateFile(%s, %x, %x, ...) (open) failed with OS status " + "%u (%s)", filename, access, share, + st, DisplayError(buf, sizeof(buf), st))); + if (st == ERROR_SHARING_VIOLATION) + throw Exception(HAM_WOULD_BLOCK); + throw Exception(st == ERROR_FILE_NOT_FOUND + ? HAM_FILE_NOT_FOUND + : HAM_IO_ERROR); + } + + m_fd = fd; +} + +void +File::close() +{ + if (m_fd != HAM_INVALID_FD) { + if (!CloseHandle((HANDLE)m_fd)) { + char buf[256]; + ham_status_t st = (ham_status_t)GetLastError(); + ham_log(("CloseHandle failed with OS status %u (%s)", st, + DisplayError(buf, sizeof(buf), st))); + throw Exception(HAM_IO_ERROR); + } + m_fd = HAM_INVALID_FD; + } + + if (m_mmaph != HAM_INVALID_FD) { + if (!CloseHandle((HANDLE)m_mmaph)) { + char buf[256]; + ham_status_t st = (ham_status_t)GetLastError(); + ham_log(("CloseHandle failed with OS status %u (%s)", st, + DisplayError(buf, sizeof(buf), st))); + throw Exception(HAM_IO_ERROR); + } + m_mmaph = HAM_INVALID_FD; + } +} + +void +Socket::connect(const char *hostname, uint16_t port, uint32_t timeout_sec) +{ + WORD sockVersion = MAKEWORD(1, 1); + WSADATA wsaData; + WSAStartup(sockVersion, &wsaData); + + ham_socket_t s = ::socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (s < 0) { + ham_log(("failed creating socket: %s", 
strerror(errno))); + throw Exception(HAM_IO_ERROR); + } + + LPHOSTENT server = ::gethostbyname(hostname); + if (!server) { + ham_log(("unable to resolve hostname %s", hostname)); + ::closesocket(s); + throw Exception(HAM_NETWORK_ERROR); + } + + SOCKADDR_IN addr; + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr = *((LPIN_ADDR)*server->h_addr_list); + addr.sin_port = htons(port); + if (::connect(s, (LPSOCKADDR)&addr, sizeof(addr)) < 0) { + ham_log(("unable to connect to %s:%d: %s", hostname, (int)port, + strerror(errno))); + ::closesocket(s); + throw Exception(HAM_NETWORK_ERROR); + } + + if (timeout_sec) { + struct timeval tv; + tv.tv_sec = timeout_sec; + tv.tv_usec = 0; + if (::setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, (char *)&tv, sizeof(tv)) < 0) { + char buf[256]; + ham_log(("unable to set socket timeout to %u sec: %u/%s", timeout_sec, + WSAGetLastError(), DisplayError(buf, sizeof(buf), + WSAGetLastError()))); + // fall through, this is not critical + } + } + + m_socket = s; +} + +void +Socket::send(const uint8_t *data, size_t len) +{ + size_t sent = 0; + char buf[256]; + ham_status_t st; + + while (sent != len) { + int s = ::send(m_socket, (const char *)(data + sent), len - sent, 0); + if (s <= 0) { + st = (ham_status_t)GetLastError(); + ham_log(("send failed with OS status %u (%s)", st, + DisplayError(buf, sizeof(buf), st))); + throw Exception(HAM_IO_ERROR); + } + sent += s; + } +} + +void +Socket::recv(uint8_t *data, size_t len) +{ + size_t read = 0; + char buf[256]; + ham_status_t st; + + while (read != len) { + int r = ::recv(m_socket, (char *)(data + read), len - read, 0); + if (r <= 0) { + st = (ham_status_t)GetLastError(); + ham_log(("recv failed with OS status %u (%s)", st, + DisplayError(buf, sizeof(buf), st))); + throw Exception(HAM_IO_ERROR); + } + read += r; + } +} + +void +Socket::close() +{ + if (m_socket != HAM_INVALID_FD) { + if (::closesocket(m_socket) == -1) + throw Exception(HAM_IO_ERROR); + m_socket = 
HAM_INVALID_FD; + } +} + +} // namespace hamsterdb diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1os/socket.h b/plugins/Dbx_kv/src/hamsterdb/src/1os/socket.h new file mode 100644 index 0000000000..0acdfdd14e --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1os/socket.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * A simple wrapper around a tcp socket handle. Throws exceptions in + * case of errors + * + * @exception_safe: basic + * @thread_safe: unknown + */ + +#ifndef HAM_SOCKET_H +#define HAM_SOCKET_H + +#include "0root/root.h" + +#include <stdio.h> +#include <limits.h> + +#include "ham/types.h" + +// Always verify that a file of level N does not include headers > N! +#include "1os/os.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class Socket +{ + public: + // Constructor creates an empty socket + Socket() + : m_socket(HAM_INVALID_FD) { + } + + // Destructor closes the socket + ~Socket() { + close(); + } + + // Connects to a remote host + void connect(const char *hostname, uint16_t port, uint32_t timeout_sec); + + // Sends data to the connected server + void send(const uint8_t *data, size_t len); + + // Receives data from the connected server; blocking! 
+ void recv(uint8_t *data, size_t len); + + // Closes the connection; no problem if socket was already closed + void close(); + + private: + ham_socket_t m_socket; +}; + +} // namespace hamsterdb + +#endif /* HAM_SOCKET_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/1rb/rb.h b/plugins/Dbx_kv/src/hamsterdb/src/1rb/rb.h new file mode 100644 index 0000000000..fcf0c135d5 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/1rb/rb.h @@ -0,0 +1,977 @@ +/*- + ******************************************************************************* + * + * cpp macro implementation of left-leaning 2-3 red-black trees. Parent + * pointers are not used, and color bits are stored in the least significant + * bit of right-child pointers (if RB_COMPACT is defined), thus making node + * linkage as compact as is possible for red-black trees. + * + * Usage: + * + * #include <stdint.h> + * #include <stdbool.h> + * #define NDEBUG // (Optional, see assert(3).) + * #include <assert.h> + * #define RB_COMPACT // (Optional, embed color bits in right-child pointers.) + * #include <rb.h> + * ... + * + ******************************************************************************* + */ + +#ifndef RB_H_ +#define RB_H_ + +#include "0root/root.h" + +#ifndef HAM_OS_WIN32 +# include <stdint.h> +# include <sys/cdefs.h> +#endif +#include <assert.h> + +#ifdef RB_COMPACT +/* Node structure. */ +#define rb_node(a_type) \ +struct { \ + a_type *rbn_left; \ + a_type *rbn_right_red; \ +} +#else +#define rb_node(a_type) \ +struct { \ + a_type *rbn_left; \ + a_type *rbn_right; \ + bool rbn_red; \ +} +#endif + +/* Root structure. */ +#define rbt(a_type) \ +struct { \ + a_type *rbt_root; \ + a_type rbt_nil; \ +} + +/* Left accessors. */ +#define rbtn_left_get(a_type, a_field, a_node) \ + ((a_node)->a_field.rbn_left) +#define rbtn_left_set(a_type, a_field, a_node, a_left) do { \ + (a_node)->a_field.rbn_left = a_left; \ +} while (0) + +#ifdef RB_COMPACT +/* Right accessors. 
*/ +#define rbtn_right_get(a_type, a_field, a_node) \ + ((a_type *) (((intptr_t) (a_node)->a_field.rbn_right_red) \ + & ((ssize_t)-2))) +#define rbtn_right_set(a_type, a_field, a_node, a_right) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) (((uintptr_t) a_right) \ + | (((uintptr_t) (a_node)->a_field.rbn_right_red) & ((size_t)1))); \ +} while (0) + +/* Color accessors. */ +#define rbtn_red_get(a_type, a_field, a_node) \ + ((bool) (((uintptr_t) (a_node)->a_field.rbn_right_red) \ + & ((size_t)1))) +#define rbtn_color_set(a_type, a_field, a_node, a_red) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) ((((intptr_t) \ + (a_node)->a_field.rbn_right_red) & ((ssize_t)-2)) \ + | ((ssize_t)a_red)); \ +} while (0) +#define rbtn_red_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) (((uintptr_t) \ + (a_node)->a_field.rbn_right_red) | ((size_t)1)); \ +} while (0) +#define rbtn_black_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_right_red = (a_type *) (((intptr_t) \ + (a_node)->a_field.rbn_right_red) & ((ssize_t)-2)); \ +} while (0) +#else +/* Right accessors. */ +#define rbtn_right_get(a_type, a_field, a_node) \ + ((a_node)->a_field.rbn_right) +#define rbtn_right_set(a_type, a_field, a_node, a_right) do { \ + (a_node)->a_field.rbn_right = a_right; \ +} while (0) + +/* Color accessors. */ +#define rbtn_red_get(a_type, a_field, a_node) \ + ((a_node)->a_field.rbn_red) +#define rbtn_color_set(a_type, a_field, a_node, a_red) do { \ + (a_node)->a_field.rbn_red = (a_red); \ +} while (0) +#define rbtn_red_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_red = true; \ +} while (0) +#define rbtn_black_set(a_type, a_field, a_node) do { \ + (a_node)->a_field.rbn_red = false; \ +} while (0) +#endif + +/* Node initializer. 
*/ +#define rbt_node_new(a_type, a_field, a_rbt, a_node) do { \ + rbtn_left_set(a_type, a_field, (a_node), &(a_rbt)->rbt_nil); \ + rbtn_right_set(a_type, a_field, (a_node), &(a_rbt)->rbt_nil); \ + rbtn_red_set(a_type, a_field, (a_node)); \ +} while (0) + +/* Tree initializer. */ +#define rb_new(a_type, a_field, a_rbt) do { \ + (a_rbt)->rbt_root = &(a_rbt)->rbt_nil; \ + rbt_node_new(a_type, a_field, a_rbt, &(a_rbt)->rbt_nil); \ + rbtn_black_set(a_type, a_field, &(a_rbt)->rbt_nil); \ +} while (0) + +/* Internal utility macros. */ +#define rbtn_first(a_type, a_field, a_rbt, a_root, r_node) do { \ + (r_node) = (a_root); \ + if ((r_node) != &(a_rbt)->rbt_nil) { \ + for (; \ + rbtn_left_get(a_type, a_field, (r_node)) != &(a_rbt)->rbt_nil;\ + (r_node) = rbtn_left_get(a_type, a_field, (r_node))) { \ + } \ + } \ +} while (0) + +#define rbtn_last(a_type, a_field, a_rbt, a_root, r_node) do { \ + (r_node) = (a_root); \ + if ((r_node) != &(a_rbt)->rbt_nil) { \ + for (; rbtn_right_get(a_type, a_field, (r_node)) != \ + &(a_rbt)->rbt_nil; (r_node) = rbtn_right_get(a_type, a_field, \ + (r_node))) { \ + } \ + } \ +} while (0) + +#define rbtn_rotate_left(a_type, a_field, a_node, r_node) do { \ + (r_node) = rbtn_right_get(a_type, a_field, (a_node)); \ + rbtn_right_set(a_type, a_field, (a_node), \ + rbtn_left_get(a_type, a_field, (r_node))); \ + rbtn_left_set(a_type, a_field, (r_node), (a_node)); \ +} while (0) + +#define rbtn_rotate_right(a_type, a_field, a_node, r_node) do { \ + (r_node) = rbtn_left_get(a_type, a_field, (a_node)); \ + rbtn_left_set(a_type, a_field, (a_node), \ + rbtn_right_get(a_type, a_field, (r_node))); \ + rbtn_right_set(a_type, a_field, (r_node), (a_node)); \ +} while (0) + +/* + * The rb_proto() macro generates function prototypes that correspond to the + * functions generated by an equivalently parameterized call to rb_gen(). 
+ */ + +#define rb_proto(a_attr, a_prefix, a_rbt_type, a_type) \ +a_attr void \ +a_prefix##new(a_rbt_type *rbtree); \ +a_attr a_type * \ +a_prefix##first(a_rbt_type *rbtree); \ +a_attr a_type * \ +a_prefix##last(a_rbt_type *rbtree); \ +a_attr a_type * \ +a_prefix##next(a_rbt_type *rbtree, a_type *node); \ +a_attr a_type * \ +a_prefix##prev(a_rbt_type *rbtree, a_type *node); \ +a_attr a_type * \ +a_prefix##search(a_rbt_type *rbtree, a_type *key); \ +a_attr a_type * \ +a_prefix##nsearch(a_rbt_type *rbtree, a_type *key); \ +a_attr a_type * \ +a_prefix##psearch(a_rbt_type *rbtree, a_type *key); \ +a_attr void \ +a_prefix##insert(a_rbt_type *rbtree, a_type *node); \ +a_attr void \ +a_prefix##remove(a_rbt_type *rbtree, a_type *node);/* \ +a_attr a_type * \ +a_prefix##iter(a_rbt_type *rbtree, a_type *start, a_type *(*cb)( \ + a_rbt_type *, a_type *, void *), void *arg); \ +a_attr a_type * \ +a_prefix##reverse_iter(a_rbt_type *rbtree, a_type *start, \ + a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg);*/ + +/* + * The rb_gen() macro generates a type-specific red-black tree implementation, + * based on the above cpp macros. + * + * Arguments: + * + * a_attr : Function attribute for generated functions (ex: static). + * a_prefix : Prefix for generated functions (ex: ex_). + * a_rb_type : Type for red-black tree data structure (ex: ex_t). + * a_type : Type for red-black tree node data structure (ex: ex_node_t). + * a_field : Name of red-black tree node linkage (ex: ex_link). + * a_cmp : Node comparison function name, with the following prototype: + * int (a_cmp *)(a_type *a_node, a_type *a_other); + * ^^^^^^ + * or a_key + * Interpretation of comparision function return values: + * -1 : a_node < a_other + * 0 : a_node == a_other + * 1 : a_node > a_other + * In all cases, the a_node or a_key macro argument is the first + * argument to the comparison function, which makes it possible + * to write comparison functions that treat the first argument + * specially. 
+ * + * Assuming the following setup: + * + * typedef struct ex_node_s ex_node_t; + * struct ex_node_s { + * rb_node(ex_node_t) ex_link; + * }; + * typedef rbt(ex_node_t) ex_t; + * rb_gen(static, ex_, ex_t, ex_node_t, ex_link, ex_cmp) + * + * The following API is generated: + * + * static void + * ex_new(ex_t *tree); + * Description: Initialize a red-black tree structure. + * Args: + * tree: Pointer to an uninitialized red-black tree object. + * + * static ex_node_t * + * ex_first(ex_t *tree); + * static ex_node_t * + * ex_last(ex_t *tree); + * Description: Get the first/last node in tree. + * Args: + * tree: Pointer to an initialized red-black tree object. + * Ret: First/last node in tree, or NULL if tree is empty. + * + * static ex_node_t * + * ex_next(ex_t *tree, ex_node_t *node); + * static ex_node_t * + * ex_prev(ex_t *tree, ex_node_t *node); + * Description: Get node's successor/predecessor. + * Args: + * tree: Pointer to an initialized red-black tree object. + * node: A node in tree. + * Ret: node's successor/predecessor in tree, or NULL if node is + * last/first. + * + * static ex_node_t * + * ex_search(ex_t *tree, ex_node_t *key); + * Description: Search for node that matches key. + * Args: + * tree: Pointer to an initialized red-black tree object. + * key : Search key. + * Ret: Node in tree that matches key, or NULL if no match. + * + * static ex_node_t * + * ex_nsearch(ex_t *tree, ex_node_t *key); + * static ex_node_t * + * ex_psearch(ex_t *tree, ex_node_t *key); + * Description: Search for node that matches key. If no match is found, + * return what would be key's successor/predecessor, were + * key in tree. + * Args: + * tree: Pointer to an initialized red-black tree object. + * key : Search key. + * Ret: Node in tree that matches key, or if no match, hypothetical node's + * successor/predecessor (NULL if no successor/predecessor). + * + * static void + * ex_insert(ex_t *tree, ex_node_t *node); + * Description: Insert node into tree. 
+ * Args: + * tree: Pointer to an initialized red-black tree object. + * node: Node to be inserted into tree. + * + * static void + * ex_remove(ex_t *tree, ex_node_t *node); + * Description: Remove node from tree. + * Args: + * tree: Pointer to an initialized red-black tree object. + * node: Node in tree to be removed. + * + * static ex_node_t * + * ex_iter(ex_t *tree, ex_node_t *start, ex_node_t *(*cb)(ex_t *, + * ex_node_t *, void *), void *arg); + * static ex_node_t * + * ex_reverse_iter(ex_t *tree, ex_node_t *start, ex_node_t *(*cb)(ex_t *, + * ex_node_t *, void *), void *arg); + * Description: Iterate forward/backward over tree, starting at node. If + * tree is modified, iteration must be immediately + * terminated by the callback function that causes the + * modification. + * Args: + * tree : Pointer to an initialized red-black tree object. + * start: Node at which to start iteration, or NULL to start at + * first/last node. + * cb : Callback function, which is called for each node during + * iteration. Under normal circumstances the callback function + * should return NULL, which causes iteration to continue. If a + * callback function returns non-NULL, iteration is immediately + * terminated and the non-NULL return value is returned by the + * iterator. This is useful for re-starting iteration after + * modifying tree. + * arg : Opaque pointer passed to cb(). + * Ret: NULL if iteration completed, or the non-NULL callback return value + * that caused termination of the iteration. + * Note: In this copy of the header the ex_iter()/ex_reverse_iter() + * generators are commented out of rb_gen() and rb_proto(), so + * these two functions are not actually generated.
/*
 * rb_gen() expands to the definitions of a_prefix##new, first, last, next,
 * prev, search, nsearch, psearch, insert and remove for trees whose nodes are
 * of type a_type, linked through a_field and ordered by a_cmp.
 *
 * Notes for maintainers:
 *  - Every lookup translates the internal rbt_nil sentinel to NULL before
 *    returning.
 *  - next()/prev() fall back to a descent from the root driven by a_cmp when
 *    node has no right/left subtree, so node must be in the tree.
 *  - insert() asserts that a_cmp never returns 0 for the new node, i.e.
 *    duplicates are only detected when assertions are enabled.
 *  - remove() dereferences nodep right after the descent; if node is not in
 *    the tree nodep is still NULL at that point.
 *  - insert()/remove() record the descent in a fixed-size path[] array of
 *    sizeof(void *) << 4 entries; nothing checks the depth against that size
 *    (presumably sized to bound the height of any tree that fits in the
 *    address space -- TODO confirm).
 *  - The iter/reverse_iter generators at the end are commented out, so those
 *    functions are not generated by this copy of the header.
 */
#define rb_gen(a_attr, a_prefix, a_rbt_type, a_type, a_field, a_cmp) \
a_attr void \
a_prefix##new(a_rbt_type *rbtree) { \
    rb_new(a_type, a_field, rbtree); \
} \
a_attr a_type * \
a_prefix##first(a_rbt_type *rbtree) { \
    a_type *ret; \
    rbtn_first(a_type, a_field, rbtree, rbtree->rbt_root, ret); \
    if (ret == &rbtree->rbt_nil) { \
        ret = NULL; \
    } \
    return (ret); \
} \
a_attr a_type * \
a_prefix##last(a_rbt_type *rbtree) { \
    a_type *ret; \
    rbtn_last(a_type, a_field, rbtree, rbtree->rbt_root, ret); \
    if (ret == &rbtree->rbt_nil) { \
        ret = NULL; \
    } \
    return (ret); \
} \
a_attr a_type * \
a_prefix##next(a_rbt_type *rbtree, a_type *node) { \
    a_type *ret; \
    if (rbtn_right_get(a_type, a_field, node) != &rbtree->rbt_nil) { \
        /* Successor is the leftmost node of the right subtree. */ \
        rbtn_first(a_type, a_field, rbtree, rbtn_right_get(a_type, \
          a_field, node), ret); \
    } else { \
        /* No right subtree: walk down from the root towards node and */\
        /* remember the last node at which the walk turned left. */\
        a_type *tnode = rbtree->rbt_root; \
        assert(tnode != &rbtree->rbt_nil); \
        ret = &rbtree->rbt_nil; \
        while (true) { \
            int cmp = (a_cmp)(node, tnode); \
            if (cmp < 0) { \
                ret = tnode; \
                tnode = rbtn_left_get(a_type, a_field, tnode); \
            } else if (cmp > 0) { \
                tnode = rbtn_right_get(a_type, a_field, tnode); \
            } else { \
                break; \
            } \
            assert(tnode != &rbtree->rbt_nil); \
        } \
    } \
    if (ret == &rbtree->rbt_nil) { \
        ret = (NULL); \
    } \
    return (ret); \
} \
a_attr a_type * \
a_prefix##prev(a_rbt_type *rbtree, a_type *node) { \
    a_type *ret; \
    if (rbtn_left_get(a_type, a_field, node) != &rbtree->rbt_nil) { \
        /* Predecessor is the rightmost node of the left subtree. */ \
        rbtn_last(a_type, a_field, rbtree, rbtn_left_get(a_type, \
          a_field, node), ret); \
    } else { \
        /* No left subtree: walk down from the root towards node and */\
        /* remember the last node at which the walk turned right. */\
        a_type *tnode = rbtree->rbt_root; \
        assert(tnode != &rbtree->rbt_nil); \
        ret = &rbtree->rbt_nil; \
        while (true) { \
            int cmp = (a_cmp)(node, tnode); \
            if (cmp < 0) { \
                tnode = rbtn_left_get(a_type, a_field, tnode); \
            } else if (cmp > 0) { \
                ret = tnode; \
                tnode = rbtn_right_get(a_type, a_field, tnode); \
            } else { \
                break; \
            } \
            assert(tnode != &rbtree->rbt_nil); \
        } \
    } \
    if (ret == &rbtree->rbt_nil) { \
        ret = (NULL); \
    } \
    return (ret); \
} \
a_attr a_type * \
a_prefix##search(a_rbt_type *rbtree, a_type *key) { \
    a_type *ret; \
    int cmp; \
    ret = rbtree->rbt_root; \
    while (ret != &rbtree->rbt_nil \
      && (cmp = (a_cmp)(key, ret)) != 0) { \
        if (cmp < 0) { \
            ret = rbtn_left_get(a_type, a_field, ret); \
        } else { \
            ret = rbtn_right_get(a_type, a_field, ret); \
        } \
    } \
    if (ret == &rbtree->rbt_nil) { \
        ret = (NULL); \
    } \
    return (ret); \
} \
a_attr a_type * \
a_prefix##nsearch(a_rbt_type *rbtree, a_type *key) { \
    a_type *ret; \
    a_type *tnode = rbtree->rbt_root; \
    ret = &rbtree->rbt_nil; \
    /* ret tracks the last node at which the descent turned left. */ \
    while (tnode != &rbtree->rbt_nil) { \
        int cmp = (a_cmp)(key, tnode); \
        if (cmp < 0) { \
            ret = tnode; \
            tnode = rbtn_left_get(a_type, a_field, tnode); \
        } else if (cmp > 0) { \
            tnode = rbtn_right_get(a_type, a_field, tnode); \
        } else { \
            ret = tnode; \
            break; \
        } \
    } \
    if (ret == &rbtree->rbt_nil) { \
        ret = (NULL); \
    } \
    return (ret); \
} \
a_attr a_type * \
a_prefix##psearch(a_rbt_type *rbtree, a_type *key) { \
    a_type *ret; \
    a_type *tnode = rbtree->rbt_root; \
    ret = &rbtree->rbt_nil; \
    /* ret tracks the last node at which the descent turned right. */ \
    while (tnode != &rbtree->rbt_nil) { \
        int cmp = (a_cmp)(key, tnode); \
        if (cmp < 0) { \
            tnode = rbtn_left_get(a_type, a_field, tnode); \
        } else if (cmp > 0) { \
            ret = tnode; \
            tnode = rbtn_right_get(a_type, a_field, tnode); \
        } else { \
            ret = tnode; \
            break; \
        } \
    } \
    if (ret == &rbtree->rbt_nil) { \
        ret = (NULL); \
    } \
    return (ret); \
} \
a_attr void \
a_prefix##insert(a_rbt_type *rbtree, a_type *node) { \
    /* path[] records every node visited on the way down together */\
    /* with the direction taken (cmp < 0: left, otherwise right). */\
    struct { \
        a_type *node; \
        int cmp; \
    } path[sizeof(void *) << 4], *pathp; \
    rbt_node_new(a_type, a_field, rbtree, node); \
    /* Wind. */ \
    path->node = rbtree->rbt_root; \
    for (pathp = path; pathp->node != &rbtree->rbt_nil; pathp++) { \
        int cmp = pathp->cmp = a_cmp(node, pathp->node); \
        assert(cmp != 0); \
        if (cmp < 0) { \
            pathp[1].node = rbtn_left_get(a_type, a_field, \
              pathp->node); \
        } else { \
            pathp[1].node = rbtn_right_get(a_type, a_field, \
              pathp->node); \
        } \
    } \
    pathp->node = node; \
    /* Unwind. */ \
    /* Each step re-links the (possibly changed) child recorded in */\
    /* pathp[1] and stops early once that child is black. */\
    for (pathp--; (uintptr_t)pathp >= (uintptr_t)path; pathp--) { \
        a_type *cnode = pathp->node; \
        if (pathp->cmp < 0) { \
            a_type *left = pathp[1].node; \
            rbtn_left_set(a_type, a_field, cnode, left); \
            if (rbtn_red_get(a_type, a_field, left)) { \
                a_type *leftleft = rbtn_left_get(a_type, a_field, left);\
                if (rbtn_red_get(a_type, a_field, leftleft)) { \
                    /* Fix up 4-node. */ \
                    a_type *tnode; \
                    rbtn_black_set(a_type, a_field, leftleft); \
                    rbtn_rotate_right(a_type, a_field, cnode, tnode); \
                    cnode = tnode; \
                } \
            } else { \
                return; \
            } \
        } else { \
            a_type *right = pathp[1].node; \
            rbtn_right_set(a_type, a_field, cnode, right); \
            if (rbtn_red_get(a_type, a_field, right)) { \
                a_type *left = rbtn_left_get(a_type, a_field, cnode); \
                if (rbtn_red_get(a_type, a_field, left)) { \
                    /* Split 4-node. */ \
                    rbtn_black_set(a_type, a_field, left); \
                    rbtn_black_set(a_type, a_field, right); \
                    rbtn_red_set(a_type, a_field, cnode); \
                } else { \
                    /* Lean left. */ \
                    a_type *tnode; \
                    bool tred = rbtn_red_get(a_type, a_field, cnode); \
                    rbtn_rotate_left(a_type, a_field, cnode, tnode); \
                    rbtn_color_set(a_type, a_field, tnode, tred); \
                    rbtn_red_set(a_type, a_field, cnode); \
                    cnode = tnode; \
                } \
            } else { \
                return; \
            } \
        } \
        pathp->node = cnode; \
    } \
    /* Set root, and make it black. */ \
    rbtree->rbt_root = path->node; \
    rbtn_black_set(a_type, a_field, rbtree->rbt_root); \
} \
a_attr void \
a_prefix##remove(a_rbt_type *rbtree, a_type *node) { \
    /* Same path[] bookkeeping as in insert(); nodep remembers the */\
    /* path slot that holds the node being removed. */\
    struct { \
        a_type *node; \
        int cmp; \
    } *pathp, *nodep, path[sizeof(void *) << 4]; \
    /* Wind. */ \
    nodep = NULL; /* Silence compiler warning. */ \
    path->node = rbtree->rbt_root; \
    for (pathp = path; pathp->node != &rbtree->rbt_nil; pathp++) { \
        int cmp = pathp->cmp = a_cmp(node, pathp->node); \
        if (cmp < 0) { \
            pathp[1].node = rbtn_left_get(a_type, a_field, \
              pathp->node); \
        } else { \
            pathp[1].node = rbtn_right_get(a_type, a_field, \
              pathp->node); \
            if (cmp == 0) { \
                /* Find node's successor, in preparation for swap. */ \
                pathp->cmp = 1; \
                nodep = pathp; \
                for (pathp++; pathp->node != &rbtree->rbt_nil; \
                  pathp++) { \
                    pathp->cmp = -1; \
                    pathp[1].node = rbtn_left_get(a_type, a_field, \
                      pathp->node); \
                } \
                break; \
            } \
        } \
    } \
    assert(nodep->node == node); \
    pathp--; \
    if (pathp->node != node) { \
        /* Swap node with its successor. */ \
        bool tred = rbtn_red_get(a_type, a_field, pathp->node); \
        rbtn_color_set(a_type, a_field, pathp->node, \
          rbtn_red_get(a_type, a_field, node)); \
        rbtn_left_set(a_type, a_field, pathp->node, \
          rbtn_left_get(a_type, a_field, node)); \
        /* If node's successor is its right child, the following code */\
        /* will do the wrong thing for the right child pointer. */\
        /* However, it doesn't matter, because the pointer will be */\
        /* properly set when the successor is pruned. */\
        rbtn_right_set(a_type, a_field, pathp->node, \
          rbtn_right_get(a_type, a_field, node)); \
        rbtn_color_set(a_type, a_field, node, tred); \
        /* The pruned leaf node's child pointers are never accessed */\
        /* again, so don't bother setting them to nil. */\
        nodep->node = pathp->node; \
        pathp->node = node; \
        if (nodep == path) { \
            rbtree->rbt_root = nodep->node; \
        } else { \
            if (nodep[-1].cmp < 0) { \
                rbtn_left_set(a_type, a_field, nodep[-1].node, \
                  nodep->node); \
            } else { \
                rbtn_right_set(a_type, a_field, nodep[-1].node, \
                  nodep->node); \
            } \
        } \
    } else { \
        a_type *left = rbtn_left_get(a_type, a_field, node); \
        if (left != &rbtree->rbt_nil) { \
            /* node has no successor, but it has a left child. */\
            /* Splice node out, without losing the left child. */\
            assert(rbtn_red_get(a_type, a_field, node) == false); \
            assert(rbtn_red_get(a_type, a_field, left)); \
            rbtn_black_set(a_type, a_field, left); \
            if (pathp == path) { \
                rbtree->rbt_root = left; \
            } else { \
                if (pathp[-1].cmp < 0) { \
                    rbtn_left_set(a_type, a_field, pathp[-1].node, \
                      left); \
                } else { \
                    rbtn_right_set(a_type, a_field, pathp[-1].node, \
                      left); \
                } \
            } \
            return; \
        } else if (pathp == path) { \
            /* The tree only contained one node. */ \
            rbtree->rbt_root = &rbtree->rbt_nil; \
            return; \
        } \
    } \
    if (rbtn_red_get(a_type, a_field, pathp->node)) { \
        /* Prune red node, which requires no fixup. */ \
        assert(pathp[-1].cmp < 0); \
        rbtn_left_set(a_type, a_field, pathp[-1].node, \
          &rbtree->rbt_nil); \
        return; \
    } \
    /* The node to be pruned is black, so unwind until balance is */\
    /* restored. */\
    pathp->node = &rbtree->rbt_nil; \
    for (pathp--; (uintptr_t)pathp >= (uintptr_t)path; pathp--) { \
        assert(pathp->cmp != 0); \
        if (pathp->cmp < 0) { \
            rbtn_left_set(a_type, a_field, pathp->node, \
              pathp[1].node); \
            assert(rbtn_red_get(a_type, a_field, pathp[1].node) \
              == false); \
            if (rbtn_red_get(a_type, a_field, pathp->node)) { \
                a_type *right = rbtn_right_get(a_type, a_field, \
                  pathp->node); \
                a_type *rightleft = rbtn_left_get(a_type, a_field, \
                  right); \
                a_type *tnode; \
                if (rbtn_red_get(a_type, a_field, rightleft)) { \
                    /* In the following diagrams, ||, //, and \\      */\
                    /* indicate the path to the removed node.         */\
                    /*                                                */\
                    /*      ||                                        */\
                    /*    pathp(r)                                    */\
                    /*  //        \                                   */\
                    /* (b)        (b)                                 */\
                    /*           /                                    */\
                    /*          (r)                                   */\
                    /*                                                */\
                    rbtn_black_set(a_type, a_field, pathp->node); \
                    rbtn_rotate_right(a_type, a_field, right, tnode); \
                    rbtn_right_set(a_type, a_field, pathp->node, tnode);\
                    rbtn_rotate_left(a_type, a_field, pathp->node, \
                      tnode); \
                } else { \
                    /*      ||                                        */\
                    /*    pathp(r)                                    */\
                    /*  //        \                                   */\
                    /* (b)        (b)                                 */\
                    /*           /                                    */\
                    /*          (b)                                   */\
                    /*                                                */\
                    rbtn_rotate_left(a_type, a_field, pathp->node, \
                      tnode); \
                } \
                /* Balance restored, but rotation modified subtree */\
                /* root. */\
                assert((uintptr_t)pathp > (uintptr_t)path); \
                if (pathp[-1].cmp < 0) { \
                    rbtn_left_set(a_type, a_field, pathp[-1].node, \
                      tnode); \
                } else { \
                    rbtn_right_set(a_type, a_field, pathp[-1].node, \
                      tnode); \
                } \
                return; \
            } else { \
                a_type *right = rbtn_right_get(a_type, a_field, \
                  pathp->node); \
                a_type *rightleft = rbtn_left_get(a_type, a_field, \
                  right); \
                if (rbtn_red_get(a_type, a_field, rightleft)) { \
                    /*      ||                                        */\
                    /*    pathp(b)                                    */\
                    /*  //        \                                   */\
                    /* (b)        (b)                                 */\
                    /*           /                                    */\
                    /*          (r)                                   */\
                    a_type *tnode; \
                    rbtn_black_set(a_type, a_field, rightleft); \
                    rbtn_rotate_right(a_type, a_field, right, tnode); \
                    rbtn_right_set(a_type, a_field, pathp->node, tnode);\
                    rbtn_rotate_left(a_type, a_field, pathp->node, \
                      tnode); \
                    /* Balance restored, but rotation modified */\
                    /* subtree root, which may actually be the tree */\
                    /* root. */\
                    if (pathp == path) { \
                        /* Set root. */ \
                        rbtree->rbt_root = tnode; \
                    } else { \
                        if (pathp[-1].cmp < 0) { \
                            rbtn_left_set(a_type, a_field, \
                              pathp[-1].node, tnode); \
                        } else { \
                            rbtn_right_set(a_type, a_field, \
                              pathp[-1].node, tnode); \
                        } \
                    } \
                    return; \
                } else { \
                    /*      ||                                        */\
                    /*    pathp(b)                                    */\
                    /*  //        \                                   */\
                    /* (b)        (b)                                 */\
                    /*           /                                    */\
                    /*          (b)                                   */\
                    /* No early return: the loop keeps unwinding. */\
                    a_type *tnode; \
                    rbtn_red_set(a_type, a_field, pathp->node); \
                    rbtn_rotate_left(a_type, a_field, pathp->node, \
                      tnode); \
                    pathp->node = tnode; \
                } \
            } \
        } else { \
            a_type *left; \
            rbtn_right_set(a_type, a_field, pathp->node, \
              pathp[1].node); \
            left = rbtn_left_get(a_type, a_field, pathp->node); \
            if (rbtn_red_get(a_type, a_field, left)) { \
                a_type *tnode; \
                a_type *leftright = rbtn_right_get(a_type, a_field, \
                  left); \
                a_type *leftrightleft = rbtn_left_get(a_type, a_field, \
                  leftright); \
                if (rbtn_red_get(a_type, a_field, leftrightleft)) { \
                    /*      ||                                        */\
                    /*    pathp(b)                                    */\
                    /*   /        \\                                  */\
                    /* (r)        (b)                                 */\
                    /*   \                                            */\
                    /*   (b)                                          */\
                    /*   /                                            */\
                    /* (r)                                            */\
                    a_type *unode; \
                    rbtn_black_set(a_type, a_field, leftrightleft); \
                    rbtn_rotate_right(a_type, a_field, pathp->node, \
                      unode); \
                    rbtn_rotate_right(a_type, a_field, pathp->node, \
                      tnode); \
                    rbtn_right_set(a_type, a_field, unode, tnode); \
                    rbtn_rotate_left(a_type, a_field, unode, tnode); \
                } else { \
                    /*      ||                                        */\
                    /*    pathp(b)                                    */\
                    /*   /        \\                                  */\
                    /* (r)        (b)                                 */\
                    /*   \                                            */\
                    /*   (b)                                          */\
                    /*   /                                            */\
                    /* (b)                                            */\
                    assert(leftright != &rbtree->rbt_nil); \
                    rbtn_red_set(a_type, a_field, leftright); \
                    rbtn_rotate_right(a_type, a_field, pathp->node, \
                      tnode); \
                    rbtn_black_set(a_type, a_field, tnode); \
                } \
                /* Balance restored, but rotation modified subtree */\
                /* root, which may actually be the tree root. */\
                if (pathp == path) { \
                    /* Set root. */ \
                    rbtree->rbt_root = tnode; \
                } else { \
                    if (pathp[-1].cmp < 0) { \
                        rbtn_left_set(a_type, a_field, pathp[-1].node, \
                          tnode); \
                    } else { \
                        rbtn_right_set(a_type, a_field, pathp[-1].node, \
                          tnode); \
                    } \
                } \
                return; \
            } else if (rbtn_red_get(a_type, a_field, pathp->node)) { \
                a_type *leftleft = rbtn_left_get(a_type, a_field, left);\
                if (rbtn_red_get(a_type, a_field, leftleft)) { \
                    /*        ||                                      */\
                    /*      pathp(r)                                  */\
                    /*     /        \\                                */\
                    /*   (b)        (b)                               */\
                    /*   /                                            */\
                    /* (r)                                            */\
                    a_type *tnode; \
                    rbtn_black_set(a_type, a_field, pathp->node); \
                    rbtn_red_set(a_type, a_field, left); \
                    rbtn_black_set(a_type, a_field, leftleft); \
                    rbtn_rotate_right(a_type, a_field, pathp->node, \
                      tnode); \
                    /* Balance restored, but rotation modified */\
                    /* subtree root. */\
                    assert((uintptr_t)pathp > (uintptr_t)path); \
                    if (pathp[-1].cmp < 0) { \
                        rbtn_left_set(a_type, a_field, pathp[-1].node, \
                          tnode); \
                    } else { \
                        rbtn_right_set(a_type, a_field, pathp[-1].node, \
                          tnode); \
                    } \
                    return; \
                } else { \
                    /*        ||                                      */\
                    /*      pathp(r)                                  */\
                    /*     /        \\                                */\
                    /*   (b)        (b)                               */\
                    /*   /                                            */\
                    /* (b)                                            */\
                    rbtn_red_set(a_type, a_field, left); \
                    rbtn_black_set(a_type, a_field, pathp->node); \
                    /* Balance restored. */ \
                    return; \
                } \
            } else { \
                a_type *leftleft = rbtn_left_get(a_type, a_field, left);\
                if (rbtn_red_get(a_type, a_field, leftleft)) { \
                    /*               ||                               */\
                    /*             pathp(b)                           */\
                    /*            /        \\                         */\
                    /*          (b)        (b)                        */\
                    /*          /                                     */\
                    /*        (r)                                     */\
                    a_type *tnode; \
                    rbtn_black_set(a_type, a_field, leftleft); \
                    rbtn_rotate_right(a_type, a_field, pathp->node, \
                      tnode); \
                    /* Balance restored, but rotation modified */\
                    /* subtree root, which may actually be the tree */\
                    /* root. */\
                    if (pathp == path) { \
                        /* Set root. */ \
                        rbtree->rbt_root = tnode; \
                    } else { \
                        if (pathp[-1].cmp < 0) { \
                            rbtn_left_set(a_type, a_field, \
                              pathp[-1].node, tnode); \
                        } else { \
                            rbtn_right_set(a_type, a_field, \
                              pathp[-1].node, tnode); \
                        } \
                    } \
                    return; \
                } else { \
                    /*               ||                               */\
                    /*             pathp(b)                           */\
                    /*            /        \\                         */\
                    /*          (b)        (b)                        */\
                    /*          /                                     */\
                    /*        (b)                                     */\
                    /* No early return: the loop keeps unwinding. */\
                    rbtn_red_set(a_type, a_field, left); \
                } \
            } \
        } \
    } \
    /* Set root. */ \
    rbtree->rbt_root = path->node; \
    assert(rbtn_red_get(a_type, a_field, rbtree->rbt_root) == false); \
}/* Disabled from here on: iterator generators, kept for reference. \
a_attr a_type * \
a_prefix##iter_recurse(a_rbt_type *rbtree, a_type *node, \
  a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg) { \
    if (node == &rbtree->rbt_nil) { \
        return (&rbtree->rbt_nil); \
    } else { \
        a_type *ret; \
        if ((ret = a_prefix##iter_recurse(rbtree, rbtn_left_get(a_type, \
          a_field, node), cb, arg)) != &rbtree->rbt_nil \
          || (ret = cb(rbtree, node, arg)) != NULL) { \
            return (ret); \
        } \
        return (a_prefix##iter_recurse(rbtree, rbtn_right_get(a_type, \
          a_field, node), cb, arg)); \
    } \
} \
a_attr a_type * \
a_prefix##iter_start(a_rbt_type *rbtree, a_type *start, a_type *node, \
  a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg) { \
    int cmp = a_cmp(start, node); \
    if (cmp < 0) { \
        a_type *ret; \
        if ((ret = a_prefix##iter_start(rbtree, start, \
          rbtn_left_get(a_type, a_field, node), cb, arg)) != \
          &rbtree->rbt_nil || (ret = cb(rbtree, node, arg)) != NULL) { \
            return (ret); \
        } \
        return (a_prefix##iter_recurse(rbtree, rbtn_right_get(a_type, \
          a_field, node), cb, arg)); \
    } else if (cmp > 0) { \
        return (a_prefix##iter_start(rbtree, start, \
          rbtn_right_get(a_type, a_field, node), cb, arg)); \
    } else { \
        a_type *ret; \
        if ((ret = cb(rbtree, node, arg)) != NULL) { \
            return (ret); \
        } \
        return (a_prefix##iter_recurse(rbtree, rbtn_right_get(a_type, \
          a_field, node), cb, arg)); \
    } \
} \
a_attr a_type * \
a_prefix##iter(a_rbt_type *rbtree, a_type *start, a_type *(*cb)( \
  a_rbt_type *, a_type *, void *), void *arg) { \
    a_type *ret; \
    if (start != NULL) { \
        ret = a_prefix##iter_start(rbtree, start, rbtree->rbt_root, \
          cb, arg); \
    } else { \
        ret = a_prefix##iter_recurse(rbtree, rbtree->rbt_root, cb, arg);\
    } \
    if (ret == &rbtree->rbt_nil) { \
        ret = NULL; \
    } \
    return (ret); \
} \
a_attr a_type * \
a_prefix##reverse_iter_recurse(a_rbt_type *rbtree, a_type *node, \
  a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg) { \
    if (node == &rbtree->rbt_nil) { \
        return (&rbtree->rbt_nil); \
    } else { \
        a_type *ret; \
        if ((ret = a_prefix##reverse_iter_recurse(rbtree, \
          rbtn_right_get(a_type, a_field, node), cb, arg)) != \
          &rbtree->rbt_nil || (ret = cb(rbtree, node, arg)) != NULL) { \
            return (ret); \
        } \
        return (a_prefix##reverse_iter_recurse(rbtree, \
          rbtn_left_get(a_type, a_field, node), cb, arg)); \
    } \
} \
a_attr a_type * \
a_prefix##reverse_iter_start(a_rbt_type *rbtree, a_type *start, \
  a_type *node, a_type *(*cb)(a_rbt_type *, a_type *, void *), \
  void *arg) { \
    int cmp = a_cmp(start, node); \
    if (cmp > 0) { \
        a_type *ret; \
        if ((ret = a_prefix##reverse_iter_start(rbtree, start, \
          rbtn_right_get(a_type, a_field, node), cb, arg)) != \
          &rbtree->rbt_nil || (ret = cb(rbtree, node, arg)) != NULL) { \
            return (ret); \
        } \
        return (a_prefix##reverse_iter_recurse(rbtree, \
          rbtn_left_get(a_type, a_field, node), cb, arg)); \
    } else if (cmp < 0) { \
        return (a_prefix##reverse_iter_start(rbtree, start, \
          rbtn_left_get(a_type, a_field, node), cb, arg)); \
    } else { \
        a_type *ret; \
        if ((ret = cb(rbtree, node, arg)) != NULL) { \
            return (ret); \
        } \
        return (a_prefix##reverse_iter_recurse(rbtree, \
          rbtn_left_get(a_type, a_field, node), cb, arg)); \
    } \
} \
a_attr a_type * \
a_prefix##reverse_iter(a_rbt_type *rbtree, a_type *start, \
  a_type *(*cb)(a_rbt_type *, a_type *, void *), void *arg) { \
    a_type *ret; \
    if (start != NULL) { \
        ret = a_prefix##reverse_iter_start(rbtree, start, \
          rbtree->rbt_root, cb, arg); \
    } else { \
        ret = a_prefix##reverse_iter_recurse(rbtree, rbtree->rbt_root, \
          cb, arg); \
    } \
    if (ret == &rbtree->rbt_nil) { \
        ret = NULL; \
    } \
    return (ret); \
}*/
+ +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +struct DatabaseConfiguration +{ + // Constructor initializes with default values + DatabaseConfiguration() + : db_name(0), flags(0), key_type(HAM_TYPE_BINARY), + key_size(HAM_KEY_SIZE_UNLIMITED), record_size(HAM_RECORD_SIZE_UNLIMITED), + key_compressor(0), record_compressor(0) { + } + + // the database name + uint16_t db_name; + + // the database flags + uint32_t flags; + + // the key type + int key_type; + + // the key size (if specified) + size_t key_size; + + // the record size (if specified) + size_t record_size; + + // the algorithm for key compression + int key_compressor; + + // the algorithm for record compression + int record_compressor; + +}; + +} // namespace hamsterdb + +#endif // HAM_DB_CONFIG_H diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2config/env_config.h b/plugins/Dbx_kv/src/hamsterdb/src/2config/env_config.h new file mode 100644 index 0000000000..9db5de4771 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/2config/env_config.h @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * The configuration settings of an Environment. 
+ * + * @exception_safe nothrow + * @thread_safe no + */ + +#ifndef HAM_ENV_CONFIG_H +#define HAM_ENV_CONFIG_H + +#include "0root/root.h" + +#include <string> +#include <limits> + +#include <ham/hamsterdb.h> + +// Always verify that a file of level N does not include headers > N! + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +#undef max + +namespace hamsterdb { + +struct EnvironmentConfiguration +{ + // Constructor initializes with default values + EnvironmentConfiguration() + : flags(0), file_mode(0644), max_databases(0), + page_size_bytes(HAM_DEFAULT_PAGE_SIZE), + cache_size_bytes(HAM_DEFAULT_CACHE_SIZE), + file_size_limit_bytes(std::numeric_limits<size_t>::max()), + remote_timeout_sec(0), journal_compressor(0), + is_encryption_enabled(false), journal_switch_threshold(0), + posix_advice(HAM_POSIX_FADVICE_NORMAL) { + } + + // the environment's flags + uint32_t flags; + + // the file mode + int file_mode; + + // the number of databases + int max_databases; + + // the page size (in bytes) + size_t page_size_bytes; + + // the cache size (in bytes) + uint64_t cache_size_bytes; + + // the file size limit (in bytes) + size_t file_size_limit_bytes; + + // the remote timeout (in seconds) + size_t remote_timeout_sec; + + // the path (or remote location) + std::string filename; + + // the path of the logfile + std::string log_filename; + + // the algorithm for journal compression + int journal_compressor; + + // true if AES encryption is enabled + bool is_encryption_enabled; + + // the AES encryption key + uint8_t encryption_key[16]; + + // threshold for switching journal files + size_t journal_switch_threshold; + + // parameter for posix_fadvise() + int posix_advice; +}; + +} // namespace hamsterdb + +#endif // HAM_ENV_CONFIG_H diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2device/device.h b/plugins/Dbx_kv/src/hamsterdb/src/2device/device.h new file mode 100644 index 0000000000..7550fad06a --- /dev/null +++ 
b/plugins/Dbx_kv/src/hamsterdb/src/2device/device.h @@ -0,0 +1,124 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Device management; a device encapsulates the physical device, either a + * file or memory chunks (for in-memory-databases) + * + * @exception_safe: nothrow + * @thread_safe: yes + */ + +#ifndef HAM_DEVICE_H +#define HAM_DEVICE_H + +#include "0root/root.h" + +#include "ham/hamsterdb.h" + +// Always verify that a file of level N does not include headers > N! 
+#include "2config/env_config.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class Page; + +class Device { + public: + // Constructor + Device(const EnvironmentConfiguration &config) + : m_config(config) { + } + + // virtual destructor + virtual ~Device() { + } + + // Returns the current page size + size_t page_size() const { + return (m_config.page_size_bytes); + } + + // Create a new device - called in ham_env_create + virtual void create() = 0; + + // Opens an existing device - called in ham_env_open + virtual void open() = 0; + + // Returns true if the device is open + virtual bool is_open() = 0; + + // Closes the device - called in ham_env_close + virtual void close() = 0; + + // Flushes the device - called in ham_env_flush + virtual void flush() = 0; + + // Truncate/resize the device + virtual void truncate(uint64_t new_size) = 0; + + // Returns the current file/storage size + virtual uint64_t file_size() = 0; + + // Seek position in a file + virtual void seek(uint64_t offset, int whence) = 0; + + // Tell the position in a file + virtual uint64_t tell() = 0; + + // Reads from the device; this function does not use mmap + virtual void read(uint64_t offset, void *buffer, size_t len) = 0; + + // Writes to the device; this function does not use mmap + virtual void write(uint64_t offset, void *buffer, size_t len) = 0; + + // Allocate storage from this device; this function + // will *NOT* use mmap. returns the offset of the allocated storage. + virtual uint64_t alloc(size_t len) = 0; + + // Reads a page from the device; this function CAN use mmap + virtual void read_page(Page *page, uint64_t address) = 0; + + // Writes a page to the device + virtual void write_page(Page *page) = 0; + + // Allocate storage for a page from this device; this function + // can use mmap if available + virtual void alloc_page(Page *page) = 0; + + // Frees a page on the device. 
+ // The caller is responsible for flushing the page; the @ref free_page + // function will assert that the page is not dirty. + virtual void free_page(Page *page) = 0; + + // Returns true if the specified range is in mapped memory + virtual bool is_mapped(uint64_t file_offset, size_t size) const = 0; + + protected: + // the Environment configuration settings + const EnvironmentConfiguration &m_config; + + friend class DeviceTest; + friend class InMemoryDeviceTest; +}; + +} // namespace hamsterdb + +#endif /* HAM_DEVICE_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2device/device_disk.h b/plugins/Dbx_kv/src/hamsterdb/src/2device/device_disk.h new file mode 100644 index 0000000000..1bd62a904e --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/2device/device_disk.h @@ -0,0 +1,238 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Device-implementation for disk-based files. Exception safety is "strong" + * for most operations, but currently it's possible that the Page is modified + * if DiskDevice::read_page fails in the middle. + * + * @exception_safe: basic/strong + * @thread_safe: no + */ + +#ifndef HAM_DEVICE_DISK_H +#define HAM_DEVICE_DISK_H + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! 
+#include "1os/file.h" +#include "1mem/mem.h" +#include "2device/device.h" +#include "2page/page.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +/* + * a File-based device + */ +class DiskDevice : public Device { + struct State { + // the database file + File file; + + // pointer to the the mmapped data + uint8_t *mmapptr; + + // the size of mmapptr as used in mmap + uint64_t mapped_size; + + // the (cached) size of the file + uint64_t file_size; + }; + + public: + DiskDevice(const EnvironmentConfiguration &config) + : Device(config) { + State state; + state.mmapptr = 0; + state.mapped_size = 0; + state.file_size = 0; + std::swap(m_state, state); + } + + // Create a new device + virtual void create() { + File file; + file.create(m_config.filename.c_str(), m_config.file_mode); + file.set_posix_advice(m_config.posix_advice); + m_state.file = file; + } + + // opens an existing device + // + // tries to map the file; if it fails then continue with read/write + virtual void open() { + bool read_only = (m_config.flags & HAM_READ_ONLY) != 0; + + State state = m_state; + state.file.open(m_config.filename.c_str(), read_only); + state.file.set_posix_advice(m_config.posix_advice); + + // the file size which backs the mapped ptr + state.file_size = state.file.get_file_size(); + + if (m_config.flags & HAM_DISABLE_MMAP) { + std::swap(m_state, state); + return; + } + + // make sure we do not exceed the "real" size of the file, otherwise + // we crash when accessing memory which exceeds the mapping (at least + // on Win32) + size_t granularity = File::get_granularity(); + if (state.file_size == 0 || state.file_size % granularity) { + std::swap(m_state, state); + return; + } + + state.mapped_size = state.file_size; + state.file.mmap(0, state.mapped_size, read_only, &state.mmapptr); + std::swap(m_state, state); + } + + // returns true if the device is open + virtual bool is_open() { + return (m_state.file.is_open()); + } + + // closes 
the device + virtual void close() { + State state = m_state; + if (state.mmapptr) + state.file.munmap(state.mmapptr, state.mapped_size); + state.file.close(); + + std::swap(m_state, state); + } + + // flushes the device + virtual void flush() { + m_state.file.flush(); + } + + // truncate/resize the device + virtual void truncate(uint64_t new_file_size) { + if (new_file_size > m_config.file_size_limit_bytes) + throw Exception(HAM_LIMITS_REACHED); + m_state.file.truncate(new_file_size); + m_state.file_size = new_file_size; + } + + // get the current file/storage size + virtual uint64_t file_size() { + ham_assert(m_state.file_size == m_state.file.get_file_size()); + return (m_state.file_size); + } + + // seek to a position in a file + virtual void seek(uint64_t offset, int whence) { + m_state.file.seek(offset, whence); + } + + // tell the position in a file + virtual uint64_t tell() { + return (m_state.file.tell()); + } + + // reads from the device; this function does NOT use mmap + virtual void read(uint64_t offset, void *buffer, size_t len) { + m_state.file.pread(offset, buffer, len); + } + + // writes to the device; this function does not use mmap, + // and is responsible for writing the data is run through the file + // filters + virtual void write(uint64_t offset, void *buffer, size_t len) { + m_state.file.pwrite(offset, buffer, len); + } + + // allocate storage from this device; this function + // will *NOT* return mmapped memory + virtual uint64_t alloc(size_t len) { + uint64_t address = m_state.file_size; + truncate(address + len); + return ((uint64_t)address); + } + + // reads a page from the device; this function CAN return a + // pointer to mmapped memory + virtual void read_page(Page *page, uint64_t address) { + // if this page is in the mapped area: return a pointer into that area. + // otherwise fall back to read/write. + if (address < m_state.mapped_size && m_state.mmapptr != 0) { + // ok, this page is mapped. 
If the Page object has a memory buffer + // then free it; afterwards return a pointer into the mapped memory + page->free_buffer(); + // the following line will not throw a C++ exception, but can + // raise a signal. If that's the case then we don't catch it because + // something is seriously wrong and proper recovery is not possible. + page->assign_mapped_buffer(&m_state.mmapptr[address], address); + return; + } + + // this page is not in the mapped area; allocate a buffer + if (page->get_data() == 0) { + // note that |p| will not leak if file.pread() throws; |p| is stored + // in the |page| object and will be cleaned up by the caller in + // case of an exception. + uint8_t *p = Memory::allocate<uint8_t>(m_config.page_size_bytes); + page->assign_allocated_buffer(p, address); + } + + m_state.file.pread(address, page->get_data(), m_config.page_size_bytes); + } + + // writes a page to the device + virtual void write_page(Page *page) { + write(page->get_address(), page->get_data(), m_config.page_size_bytes); + } + + // Allocates storage for a page from this device; this function + // will *NOT* return mmapped memory + virtual void alloc_page(Page *page) { + uint64_t address = m_state.file_size; + + truncate(address + m_config.page_size_bytes); + page->set_address(address); + + // allocate a memory buffer + uint8_t *p = Memory::allocate<uint8_t>(m_config.page_size_bytes); + page->assign_allocated_buffer(p, address); + } + + // Frees a page on the device; plays counterpoint to |alloc_page| + virtual void free_page(Page *page) { + ham_assert(page->get_data() != 0); + page->free_buffer(); + } + + // Returns true if the specified range is in mapped memory + virtual bool is_mapped(uint64_t file_offset, size_t size) const { + return (file_offset + size <= m_state.mapped_size); + } + + private: + State m_state; +}; + +} // namespace hamsterdb + +#endif /* HAM_DEVICE_DISK_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2device/device_factory.h 
b/plugins/Dbx_kv/src/hamsterdb/src/2device/device_factory.h new file mode 100644 index 0000000000..7cde29d5af --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/2device/device_factory.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * A factory for Device objects + * + * @exception_safe: strong + * @thread_safe: yes + */ + +#ifndef HAM_DEVICE_FACTORY_H +#define HAM_DEVICE_FACTORY_H + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "2config/env_config.h" +#include "2device/device_disk.h" +#include "2device/device_inmem.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +struct DeviceFactory { + // creates a new Device instance depending on the flags + static Device *create(const EnvironmentConfiguration &config) { + if (config.flags & HAM_IN_MEMORY) + return (new InMemoryDevice(config)); + else + return (new DiskDevice(config)); + } +}; + +} // namespace hamsterdb + +#endif /* HAM_DEVICE_FACTORY_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2device/device_inmem.h b/plugins/Dbx_kv/src/hamsterdb/src/2device/device_inmem.h new file mode 100644 index 0000000000..3e2055148b --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/2device/device_inmem.h @@ -0,0 +1,181 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @exception_safe: strong + * @thread_safe: no + */ + +#ifndef HAM_DEVICE_INMEM_H +#define HAM_DEVICE_INMEM_H + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "1mem/mem.h" +#include "2device/device.h" +#include "2page/page.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +/* + * an In-Memory device + */ +class InMemoryDevice : public Device { + struct State { + // flag whether this device was "opened" or is uninitialized + bool is_open; + + // the allocated bytes + uint64_t allocated_size; + }; + + public: + // constructor + InMemoryDevice(const EnvironmentConfiguration &config) + : Device(config) { + State state; + state.is_open = false; + state.allocated_size = 0; + std::swap(m_state, state); + } + + // Create a new device + virtual void create() { + m_state.is_open = true; + } + + // opens an existing device + virtual void open() { + ham_assert(!"can't open an in-memory-device"); + throw Exception(HAM_NOT_IMPLEMENTED); + } + + // returns true if the device is open + virtual bool is_open() { + return (m_state.is_open); + } + + // closes the device + virtual void close() { + ham_assert(m_state.is_open); + m_state.is_open = false; + } + + // flushes the device + virtual void flush() { + } + + // truncate/resize the device + virtual void truncate(uint64_t newsize) { + } + + // get the current 
file/storage size + virtual uint64_t file_size() { + ham_assert(!"this operation is not possible for in-memory-databases"); + throw Exception(HAM_NOT_IMPLEMENTED); + } + + // seek position in a file + virtual void seek(uint64_t offset, int whence) { + ham_assert(!"can't seek in an in-memory-device"); + throw Exception(HAM_NOT_IMPLEMENTED); + } + + // tell the position in a file + virtual uint64_t tell() { + ham_assert(!"can't tell in an in-memory-device"); + throw Exception(HAM_NOT_IMPLEMENTED); + } + + // reads from the device; this function does not use mmap + virtual void read(uint64_t offset, void *buffer, size_t len) { + ham_assert(!"operation is not possible for in-memory-databases"); + throw Exception(HAM_NOT_IMPLEMENTED); + } + + // writes to the device + virtual void write(uint64_t offset, void *buffer, size_t len) { + ham_assert(!"operation is not possible for in-memory-databases"); + throw Exception(HAM_NOT_IMPLEMENTED); + } + + // reads a page from the device + virtual void read_page(Page *page, uint64_t address) { + ham_assert(!"operation is not possible for in-memory-databases"); + throw Exception(HAM_NOT_IMPLEMENTED); + } + + // writes a page to the device + virtual void write_page(Page *page) { + } + + // allocate storage from this device; this function + // will *NOT* use mmap. 
+ virtual uint64_t alloc(size_t size) { + if (m_state.allocated_size + size > m_config.file_size_limit_bytes) + throw Exception(HAM_LIMITS_REACHED); + + uint64_t retval = (uint64_t)Memory::allocate<uint8_t>(size); + m_state.allocated_size += size; + return (retval); + } + + // allocate storage for a page from this device + virtual void alloc_page(Page *page) { + ham_assert(page->get_data() == 0); + + size_t page_size = m_config.page_size_bytes; + if (m_state.allocated_size + page_size > m_config.file_size_limit_bytes) + throw Exception(HAM_LIMITS_REACHED); + + uint8_t *p = Memory::allocate<uint8_t>(page_size); + page->assign_allocated_buffer(p, (uint64_t)PTR_TO_U64(p)); + + m_state.allocated_size += page_size; + } + + // frees a page on the device; plays counterpoint to @ref alloc_page + virtual void free_page(Page *page) { + page->free_buffer(); + + ham_assert(m_state.allocated_size >= m_config.page_size_bytes); + m_state.allocated_size -= m_config.page_size_bytes; + } + + // Returns true if the specified range is in mapped memory + virtual bool is_mapped(uint64_t file_offset, size_t size) const { + return (false); + } + + // releases a chunk of memory previously allocated with alloc() + void release(void *ptr, size_t size) { + Memory::release(ptr); + ham_assert(m_state.allocated_size >= size); + m_state.allocated_size -= size; + } + + private: + State m_state; +}; + +} // namespace hamsterdb + +#endif /* HAM_DEVICE_INMEM_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2lsn_manager/lsn_manager.h b/plugins/Dbx_kv/src/hamsterdb/src/2lsn_manager/lsn_manager.h new file mode 100644 index 0000000000..3a6be50d44 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/2lsn_manager/lsn_manager.h @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Manager for the log sequence number (lsn) + * + * @exception_safe: nothrow + * @thread_safe: no + */ + +#ifndef HAM_LSN_MANAGER_H +#define HAM_LSN_MANAGER_H + +#include "0root/root.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class LsnManager +{ + public: + // Constructor + LsnManager() + : m_state(1) { + } + + // Returns the next lsn + uint64_t next() { + return (m_state++); + } + + private: + friend struct LsnManagerTest; + + // the actual lsn + uint64_t m_state; +}; + +} // namespace hamsterdb + +#endif /* HAM_LSN_MANAGER_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2lsn_manager/lsn_manager_test.h b/plugins/Dbx_kv/src/hamsterdb/src/2lsn_manager/lsn_manager_test.h new file mode 100644 index 0000000000..59197a66cd --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/2lsn_manager/lsn_manager_test.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * Test gateway for LsnManager + * + * @exception_safe: nothrow + * @thread_safe: no + */ + +#ifndef HAM_LSN_MANAGER_TEST_H +#define HAM_LSN_MANAGER_TEST_H + +#include "0root/root.h" + +#include "2lsn_manager/lsn_manager.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +struct LsnManagerTest +{ + // Constructor + LsnManagerTest(LsnManager *lsn_manager) + : m_state(lsn_manager->m_state) { + } + + // Returns the current lsn + uint64_t lsn() const { + return (m_state); + } + + uint64_t &m_state; +}; + +} // namespace hamsterdb + +#endif /* HAM_LSN_MANAGER_TEST_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2page/page.cc b/plugins/Dbx_kv/src/hamsterdb/src/2page/page.cc new file mode 100644 index 0000000000..64558e9370 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/2page/page.cc @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "0root/root.h" + +#include <string.h> + +#include "1base/error.h" +#include "1os/os.h" +#include "2page/page.h" +#include "2device/device.h" +#include "3btree/btree_node_proxy.h" + +namespace hamsterdb { + +uint64_t Page::ms_page_count_flushed = 0; + +Page::Page(Device *device, LocalDatabase *db) + : m_device(device), m_db(db), m_address(0), m_is_allocated(false), + m_is_without_header(false), m_is_dirty(false), m_cursor_list(0), + m_node_proxy(0), m_data(0) +{ + memset(&m_prev[0], 0, sizeof(m_prev)); + memset(&m_next[0], 0, sizeof(m_next)); +} + +Page::~Page() +{ + ham_assert(m_cursor_list == 0); + +#ifdef HAM_ENABLE_HELGRIND + // safely unlock the mutex + m_mutex.try_lock(); +#endif + m_mutex.unlock(); + + if (m_node_proxy) { + delete m_node_proxy; + m_node_proxy = 0; + } + + if (m_data != 0) + m_device->free_page(this); +} + +void +Page::alloc(uint32_t type, uint32_t flags) +{ + m_device->alloc_page(this); + + if (flags & kInitializeWithZeroes) { + size_t page_size = m_device->page_size(); + memset(get_raw_payload(), 0, page_size); + } + + if (type) + set_type(type); +} + +void +Page::fetch(uint64_t address) +{ + m_device->read_page(this, address); + set_address(address); +} + +void +Page::flush() +{ + if (is_dirty()) { + m_device->write_page(this); + set_dirty(false); + ms_page_count_flushed++; + } +} + +void +Page::free_buffer() +{ + if (m_node_proxy) { + delete m_node_proxy; + m_node_proxy = 0; + } + + if (m_is_allocated) + Memory::release(m_data); + m_data = 0; +} + +} // namespace hamsterdb diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2page/page.h b/plugins/Dbx_kv/src/hamsterdb/src/2page/page.h new file mode 100644 index 0000000000..f68edc474b --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/2page/page.h @@ -0,0 +1,435 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @exception_safe: strong + * @thread_safe: no + */ + +#ifndef HAM_PAGE_H +#define HAM_PAGE_H + +#include <string.h> +#include <boost/atomic.hpp> + +#include "1base/error.h" +#include "1base/spinlock.h" +#include "1mem/mem.h" + +namespace hamsterdb { + +class Device; +class BtreeCursor; +class BtreeNodeProxy; +class LocalDatabase; + +#include "1base/packstart.h" + +/* + * This header is only available if the (non-persistent) flag + * kNpersNoHeader is not set! Blob pages do not have this header. + */ +typedef HAM_PACK_0 struct HAM_PACK_1 PPageHeader { + // flags of this page - currently only used for the Page::kType* codes + uint32_t flags; + + // reserved + uint32_t reserved; + + // the lsn of the last operation + uint64_t lsn; + + // the persistent data blob + uint8_t payload[1]; + +} HAM_PACK_2 PPageHeader; + +#include "1base/packstop.h" + +#include "1base/packstart.h" + +/* + * A union combining the page header and a pointer to the raw page data. + * + * This structure definition is present outside of @ref Page scope + * to allow compile-time OFFSETOF macros to correctly judge the size, + * depending on platform and compiler settings. + */ +typedef HAM_PACK_0 union HAM_PACK_1 PPageData { + // the persistent header + struct PPageHeader header; + + // a char pointer to the allocated storage on disk + uint8_t payload[1]; + +} HAM_PACK_2 PPageData; + +#include "1base/packstop.h" + +/* + * The Page class + * + * Each Page instance is a node in several linked lists. 
+ * In order to avoid multiple memory allocations, the previous/next pointers + * are part of the Page class (m_prev and m_next). Both fields are arrays + * of pointers and can be used i.e. with m_prev[Page::kListBucket] etc. + * (or with the methods defined below). + */ +class Page { + public: + // Misc. enums + enum { + // sizeof the persistent page header + kSizeofPersistentHeader = sizeof(PPageHeader) - 1, + + // instruct Page::alloc() to reset the page with zeroes + kInitializeWithZeroes, + }; + + // The various linked lists (indices in m_prev, m_next) + enum { + // list of all cached pages + kListCache = 0, + + // list of all pages in a changeset + kListChangeset = 1, + + // a bucket in the hash table of the cache + kListBucket = 2, + + // array limit + kListMax = 3 + }; + + // non-persistent page flags + enum { + // page->m_data was allocated with malloc, not mmap + kNpersMalloc = 1, + + // page has no header (i.e. it's part of a large blob) + kNpersNoHeader = 2 + }; + + // Page types + // + // When large BLOBs span multiple pages, only their initial page + // will have a valid type code; subsequent pages of this blog will store + // the data as-is, so as to provide one continuous storage space + enum { + // unidentified db page type + kTypeUnknown = 0x00000000, + + // the header page: this is the first page in the environment (offset 0) + kTypeHeader = 0x10000000, + + // a B+tree root page + kTypeBroot = 0x20000000, + + // a B+tree node page + kTypeBindex = 0x30000000, + + // a page storing the state of the PageManager + kTypePageManager = 0x40000000, + + // a page which stores blobs + kTypeBlob = 0x50000000 + }; + + // Default constructor + Page(Device *device, LocalDatabase *db = 0); + + // Destructor - releases allocated memory and resources, but neither + // flushes dirty pages to disk nor moves them to the freelist! + // Asserts that no cursors are attached. 
+ ~Page(); + + // Returns the size of the usable persistent payload of a page + // (page_size minus the overhead of the page header) + static uint32_t usable_page_size(uint32_t raw_page_size) { + return (raw_page_size - Page::kSizeofPersistentHeader); + } + + + // Returns the database which manages this page; can be NULL if this + // page belongs to the Environment (i.e. for freelist-pages) + LocalDatabase *get_db() { + return (m_db); + } + + // Sets the database to which this Page belongs + void set_db(LocalDatabase *db) { + m_db = db; + } + + // Returns the spinlock + Spinlock &mutex() { + return (m_mutex); + } + + // Returns the device + Device *device() { + return (m_device); + } + + // Returns true if this is the header page of the Environment + bool is_header() const { + return (m_address == 0); + } + + // Returns the address of this page + uint64_t get_address() const { + return (m_address); + } + + // Sets the address of this page + void set_address(uint64_t address) { + m_address = address; + } + + // Returns true if this page is dirty (and needs to be flushed to disk) + bool is_dirty() const { + return (m_is_dirty); + } + + // Sets this page dirty/not dirty + void set_dirty(bool dirty) { + m_is_dirty = dirty; + } + + // Returns true if the page's buffer was allocated with malloc + bool is_allocated() const { + return (m_is_allocated); + } + + // Returns true if the page has no persistent header + bool is_without_header() const { + return (m_is_without_header); + } + + // Sets a flag whether the page has no persistent header + void set_without_header(bool without_header) { + m_is_without_header = without_header; + } + + // Assign a buffer which was allocated with malloc() + void assign_allocated_buffer(void *buffer, uint64_t address) { + m_data = (PPageData *)buffer; + m_is_allocated = true; + m_address = address; + } + + // Assign a buffer from mmapped storage + void assign_mapped_buffer(void *buffer, uint64_t address) { + m_data = (PPageData *)buffer; + 
m_is_allocated = false; + m_address = address; + } + + // Free resources associated with the buffer + void free_buffer(); + + // Returns the linked list of coupled cursors (can be NULL) + BtreeCursor *cursor_list() { + return (m_cursor_list); + } + + // Sets the (head of the) linked list of cursors + void set_cursor_list(BtreeCursor *cursor) { + m_cursor_list = cursor; + } + + // Returns the page's type (kType*) + uint32_t get_type() const { + return (m_data->header.flags); + } + + // Sets the page's type (kType*) + void set_type(uint32_t type) { + m_data->header.flags = type; + } + + // Returns the lsn of the last modification + uint64_t get_lsn() const { + return (m_data->header.lsn); + } + + // Sets the lsn of the last modification + void set_lsn(uint64_t lsn) { + m_data->header.lsn = lsn; + } + + // Sets the pointer to the persistent data + void set_data(PPageData *data) { + m_data = data; + } + + // Returns the pointer to the persistent data + PPageData *get_data() { + return (m_data); + } + + // Returns the persistent payload (after the header!) + uint8_t *get_payload() { + return (m_data->header.payload); + } + + // Returns the persistent payload (after the header!) + const uint8_t *get_payload() const { + return (m_data->header.payload); + } + + // Returns the persistent payload (including the header!) + uint8_t *get_raw_payload() { + return (m_data->payload); + } + + // Returns the persistent payload (including the header!) 
+ const uint8_t *get_raw_payload() const { + return (m_data->payload); + } + + // Allocates a new page from the device + // |flags|: either 0 or kInitializeWithZeroes + void alloc(uint32_t type, uint32_t flags = 0); + + // Reads a page from the device + void fetch(uint64_t address); + + // Writes the page to the device + void flush(); + + // Returns true if this page is in a linked list + bool is_in_list(Page *list_head, int list) { + if (get_next(list) != 0) + return (true); + if (get_previous(list) != 0) + return (true); + return (list_head == this); + } + + // Inserts this page at the beginning of a list and returns the + // new head of the list + Page *list_insert(Page *list_head, int list) { + set_next(list, 0); + set_previous(list, 0); + + if (!list_head) + return (this); + + set_next(list, list_head); + list_head->set_previous(list, this); + return (this); + } + + // Removes this page from a list and returns the new head of the list + Page *list_remove(Page *list_head, int list) { + Page *n, *p; + + if (this == list_head) { + n = get_next(list); + if (n) + n->set_previous(list, 0); + set_next(list, 0); + set_previous(list, 0); + return (n); + } + + n = get_next(list); + p = get_previous(list); + if (p) + p->set_next(list, n); + if (n) + n->set_previous(list, p); + set_next(list, 0); + set_previous(list, 0); + return (list_head); + } + + // Returns the next page in a linked list + Page *get_next(int list) { + return (m_next[list]); + } + + // Returns the previous page of a linked list + Page *get_previous(int list) { + return (m_prev[list]); + } + + // Returns the cached BtreeNodeProxy + BtreeNodeProxy *get_node_proxy() { + return (m_node_proxy); + } + + // Sets the cached BtreeNodeProxy + void set_node_proxy(BtreeNodeProxy *proxy) { + m_node_proxy = proxy; + } + + // tracks number of flushed pages + static uint64_t ms_page_count_flushed; + + private: + friend class PageCollection; + + // Sets the previous page of a linked list + void set_previous(int list, 
Page *other) { + m_prev[list] = other; + } + + // Sets the next page in a linked list + void set_next(int list, Page *other) { + m_next[list] = other; + } + + // the Device for allocating storage + Device *m_device; + + // the Database handle (can be NULL) + LocalDatabase *m_db; + + // The spinlock is locked if the page is in use or written to disk + Spinlock m_mutex; + + // address of this page + uint64_t m_address; + + // Page buffer was allocated with malloc() (if not then it was mapped + // with mmap) + bool m_is_allocated; + + // Page does not have a persistent header + bool m_is_without_header; + + // is this page dirty and needs to be flushed to disk? + bool m_is_dirty; + + // linked list of all cursors which point to that page + BtreeCursor *m_cursor_list; + + // linked lists of pages - see comments above + Page *m_prev[Page::kListMax]; + Page *m_next[Page::kListMax]; + + // the cached BtreeNodeProxy object + BtreeNodeProxy *m_node_proxy; + + // the persistent data of this page + PPageData *m_data; +}; + +} // namespace hamsterdb + +#endif /* HAM_PAGE_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2page/page_collection.h b/plugins/Dbx_kv/src/hamsterdb/src/2page/page_collection.h new file mode 100644 index 0000000000..b396c78165 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/2page/page_collection.h @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * @exception_safe: strong + * @thread_safe: no + */ + +#ifndef HAM_PAGE_COLLECTION_H +#define HAM_PAGE_COLLECTION_H + +#include <string.h> + +#include <boost/atomic.hpp> + +#include "1mem/mem.h" +#include "2page/page.h" + +namespace hamsterdb { + +/* + * The PageCollection class + */ +class PageCollection { + public: + // Default constructor + PageCollection(int list_id) + : m_head(0), m_tail(0), m_size(0), m_id(list_id) { + } + + // Destructor + ~PageCollection() { + clear(); + } + + bool is_empty() const { + return (m_size == 0); + } + + int size() const { + return (m_size); + } + + // Atomically applies the |visitor()| to each page + template<typename Visitor> + void for_each(Visitor &visitor) { + for (Page *p = m_head; p != 0; p = p->get_next(m_id)) { + if (!visitor(p)) + break; + } + } + + // Atomically applies the |visitor()| to each page; starts at the tail + template<typename Visitor> + void for_each_reverse(Visitor &visitor) { + for (Page *p = m_tail; p != 0; p = p->get_previous(m_id)) { + if (!visitor(p)) + break; + } + } + + // Same as |for_each()|, but removes the page if |visitor()| returns true + template<typename Visitor> + void extract(Visitor &visitor) { + Page *page = m_head; + while (page) { + Page *next = page->get_next(m_id); + if (visitor(page)) { + del_impl(page); + } + page = next; + } + } + + // Clears the collection. + void clear() { + Page *page = m_head; + while (page) { + Page *next = page->get_next(m_id); + del_impl(page); + page = next; + } + + ham_assert(m_head == 0); + ham_assert(m_tail == 0); + ham_assert(m_size == 0); + } + + // Returns the head + Page *head() const { + return (m_head); + } + + // Returns the tail + Page *tail() const { + return (m_tail); + } + + // Returns a page from the collection + Page *get(uint64_t address) const { + for (Page *p = m_head; p != 0; p = p->get_next(m_id)) { + if (p->get_address() == address) + return (p); + } + return (0); + } + + // Removes a page from the collection. 
Returns true if the page was removed, + // otherwise false (if the page was not in the list) + bool del(Page *page) { + if (has(page)) { + del_impl(page); + return (true); + } + return (false); + } + + // Adds a new page at the head of the list. Returns true if the page was + // added, otherwise false (that's the case if the page is already part of + // the list) + bool put(Page *page) { + if (!has(page)) { + m_head = page->list_insert(m_head, m_id); + if (!m_tail) + m_tail = page; + ++m_size; + return (true); + } + return (false); + } + + // Returns true if a page with the |address| is already stored. + bool has(uint64_t address) const { + return (get(address) != 0); + } + + // Returns true if the |page| is already stored. This is much faster + // than has(uint64_t address). + bool has(Page *page) const { + return (page->is_in_list(m_head, m_id)); + } + + private: + void del_impl(Page *page) { + // First update the tail because Page::list_remove() will change the + // pointers! + if (m_tail == page) + m_tail = page->get_previous(m_id); + m_head = page->list_remove(m_head, m_id); + ham_assert(m_size > 0); + --m_size; + } + + // The head of the linked list + Page *m_head; + + // The tail of the linked list + Page *m_tail; + + // Number of elements in the list + int m_size; + + // The list ID + int m_id; +}; + +} // namespace hamsterdb + +#endif /* HAM_PAGE_COLLECTION_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2protobuf/Makefile.am b/plugins/Dbx_kv/src/hamsterdb/src/2protobuf/Makefile.am new file mode 100644 index 0000000000..b5c5c881f4 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/2protobuf/Makefile.am @@ -0,0 +1,15 @@ + +AM_CPPFLAGS = -DHAM_ENABLE_REMOTE -I$(top_builddir)/include + +# INCLUDES = + +noinst_LTLIBRARIES = libprotocol.la + +nodist_libprotocol_la_SOURCES = messages.pb.cc +libprotocol_la_SOURCES = protocol.h +libprotocol_la_LIBADD = -lprotobuf + +EXTRA_DIST = messages.proto +messages.pb.cc proto: $(srcdir)/messages.proto + protoc 
$(srcdir)/messages.proto --cpp_out=. + diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2protobuf/Makefile.in b/plugins/Dbx_kv/src/hamsterdb/src/2protobuf/Makefile.in new file mode 100644 index 0000000000..e198a11d7d --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/2protobuf/Makefile.in @@ -0,0 +1,627 @@ +# Makefile.in generated by automake 1.14.1 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994-2013 Free Software Foundation, Inc. + +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +VPATH = @srcdir@ +am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)' +am__make_running_with_option = \ + case $${target_option-} in \ + ?) 
;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +subdir = src/2protobuf +DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \ + $(top_srcdir)/depcomp +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/libtool.m4 \ + 
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \ + $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \ + $(top_srcdir)/m4/ax_cxx_gcc_abi_demangle.m4 \ + $(top_srcdir)/m4/boost.m4 $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) +libprotocol_la_DEPENDENCIES = +am_libprotocol_la_OBJECTS = +nodist_libprotocol_la_OBJECTS = messages.pb.lo +libprotocol_la_OBJECTS = $(am_libprotocol_la_OBJECTS) \ + $(nodist_libprotocol_la_OBJECTS) +AM_V_lt = $(am__v_lt_@AM_V@) +am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) +am__v_lt_0 = --silent +am__v_lt_1 = +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir) +depcomp = $(SHELL) $(top_srcdir)/depcomp +am__depfiles_maybe = depfiles +am__mv = mv -f +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CXXFLAGS) $(CXXFLAGS) +AM_V_CXX = $(am__v_CXX_@AM_V@) +am__v_CXX_ = $(am__v_CXX_@AM_DEFAULT_V@) +am__v_CXX_0 = @echo " CXX " $@; +am__v_CXX_1 = +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ + $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CXXLD = $(am__v_CXXLD_@AM_V@) +am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@) +am__v_CXXLD_0 = @echo " CXXLD " 
$@; +am__v_CXXLD_1 = +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) $(CFLAGS) +AM_V_CC = $(am__v_CC_@AM_V@) +am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_@AM_V@) +am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +SOURCES = $(libprotocol_la_SOURCES) $(nodist_libprotocol_la_SOURCES) +DIST_SOURCES = $(libprotocol_la_SOURCES) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. 
+am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +BOOST_CHRONO_LDFLAGS = @BOOST_CHRONO_LDFLAGS@ +BOOST_CHRONO_LDPATH = @BOOST_CHRONO_LDPATH@ +BOOST_CHRONO_LIBS = @BOOST_CHRONO_LIBS@ +BOOST_CPPFLAGS = @BOOST_CPPFLAGS@ +BOOST_FILESYSTEM_LDFLAGS = @BOOST_FILESYSTEM_LDFLAGS@ +BOOST_FILESYSTEM_LDPATH = @BOOST_FILESYSTEM_LDPATH@ +BOOST_FILESYSTEM_LIBS = @BOOST_FILESYSTEM_LIBS@ +BOOST_LDPATH = @BOOST_LDPATH@ +BOOST_ROOT = @BOOST_ROOT@ +BOOST_SYSTEM_LDFLAGS = @BOOST_SYSTEM_LDFLAGS@ +BOOST_SYSTEM_LDPATH = @BOOST_SYSTEM_LDPATH@ +BOOST_SYSTEM_LIBS = @BOOST_SYSTEM_LIBS@ +BOOST_THREAD_LDFLAGS = @BOOST_THREAD_LDFLAGS@ +BOOST_THREAD_LDPATH = @BOOST_THREAD_LDPATH@ +BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@ +BOOST_THREAD_WIN32_LDFLAGS = @BOOST_THREAD_WIN32_LDFLAGS@ +BOOST_THREAD_WIN32_LDPATH = @BOOST_THREAD_WIN32_LDPATH@ +BOOST_THREAD_WIN32_LIBS = @BOOST_THREAD_WIN32_LIBS@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DISTCHECK_CONFIGURE_FLAGS = @DISTCHECK_CONFIGURE_FLAGS@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +FGREP = @FGREP@ +GREP = @GREP@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +JDK_INCLUDE = @JDK_INCLUDE@ +LD = @LD@ +LDFLAGS = 
@LDFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MKDIR_P = @MKDIR_P@ +NM = @NM@ +NMEDIT = @NMEDIT@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +RANLIB = @RANLIB@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +VERSION = @VERSION@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +top_build_prefix = 
@top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +AM_CPPFLAGS = -DHAM_ENABLE_REMOTE -I$(top_builddir)/include + +# INCLUDES = +noinst_LTLIBRARIES = libprotocol.la +nodist_libprotocol_la_SOURCES = messages.pb.cc +libprotocol_la_SOURCES = protocol.h +libprotocol_la_LIBADD = -lprotobuf +EXTRA_DIST = messages.proto +all: all-am + +.SUFFIXES: +.SUFFIXES: .cc .lo .o .obj +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign src/2protobuf/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --foreign src/2protobuf/Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm -f $${locs}; \ + rm -f $${locs}; \ + } + +libprotocol.la: $(libprotocol_la_OBJECTS) 
$(libprotocol_la_DEPENDENCIES) $(EXTRA_libprotocol_la_DEPENDENCIES) + $(AM_V_CXXLD)$(CXXLINK) $(libprotocol_la_OBJECTS) $(libprotocol_la_LIBADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/messages.pb.Plo@am__quote@ + +.cc.o: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\ +@am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ +@am__fastdepCXX_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $< + +.cc.obj: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.obj$$||'`;\ +@am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ `$(CYGPATH_W) '$<'` &&\ +@am__fastdepCXX_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.cc.lo: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.lo$$||'`;\ +@am__fastdepCXX_TRUE@ $(LTCXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ +@am__fastdepCXX_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o $@ $< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: 
$(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d 
"$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile $(LTLIBRARIES) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \ + mostlyclean-am + +distclean: distclean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \ + clean-libtool clean-noinstLTLIBRARIES cscopelist-am ctags \ + ctags-am distclean distclean-compile distclean-generic \ + distclean-libtool distclean-tags distdir dvi dvi-am html \ + html-am info info-am install install-am install-data \ + install-data-am install-dvi install-dvi-am install-exec \ + install-exec-am install-html install-html-am install-info \ + install-info-am install-man install-pdf install-pdf-am \ + install-ps install-ps-am install-strip installcheck \ + installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-compile \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags tags-am uninstall uninstall-am + +messages.pb.cc proto: $(srcdir)/messages.proto + protoc $(srcdir)/messages.proto --cpp_out=. + +# Tell versions [3.59,3.63) of GNU make to not export all variables. 
+# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2protobuf/messages.proto b/plugins/Dbx_kv/src/hamsterdb/src/2protobuf/messages.proto new file mode 100644 index 0000000000..f8ec8fdcb6 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/2protobuf/messages.proto @@ -0,0 +1,457 @@ + + +package hamsterdb; + +option optimize_for = LITE_RUNTIME; + +message ProtoWrapper { + enum Type { + CONNECT_REQUEST = 10; + CONNECT_REPLY = 11; + DISCONNECT_REQUEST = 12; + DISCONNECT_REPLY = 13; + ENV_RENAME_REQUEST = 20; + ENV_RENAME_REPLY = 21; + ENV_GET_PARAMETERS_REQUEST = 30; + ENV_GET_PARAMETERS_REPLY = 31; + ENV_GET_DATABASE_NAMES_REQUEST = 40; + ENV_GET_DATABASE_NAMES_REPLY = 41; + ENV_FLUSH_REQUEST = 50; + ENV_FLUSH_REPLY = 51; + ENV_CREATE_DB_REQUEST = 60; + ENV_CREATE_DB_REPLY = 61; + ENV_OPEN_DB_REQUEST = 70; + ENV_OPEN_DB_REPLY = 71; + ENV_ERASE_DB_REQUEST = 80; + ENV_ERASE_DB_REPLY = 81; + DB_CLOSE_REQUEST = 90; + DB_CLOSE_REPLY = 91; + DB_GET_PARAMETERS_REQUEST = 100; + DB_GET_PARAMETERS_REPLY = 101; + // DB_FLUSH_REQUEST = 110; + // DB_FLUSH_REPLY = 111; + TXN_BEGIN_REQUEST = 120; + TXN_BEGIN_REPLY = 121; + TXN_COMMIT_REQUEST = 130; + TXN_COMMIT_REPLY = 131; + TXN_ABORT_REQUEST = 140; + TXN_ABORT_REPLY = 141; + DB_CHECK_INTEGRITY_REQUEST = 150; + DB_CHECK_INTEGRITY_REPLY = 151; + DB_GET_KEY_COUNT_REQUEST = 160; + DB_GET_KEY_COUNT_REPLY = 161; + DB_INSERT_REQUEST = 170; + DB_INSERT_REPLY = 171; + DB_ERASE_REQUEST = 180; + DB_ERASE_REPLY = 181; + DB_FIND_REQUEST = 190; + DB_FIND_REPLY = 191; + CURSOR_CREATE_REQUEST = 200; + CURSOR_CREATE_REPLY = 201; + CURSOR_CLONE_REQUEST = 210; + CURSOR_CLONE_REPLY = 211; + CURSOR_CLOSE_REQUEST = 220; + CURSOR_CLOSE_REPLY = 221; + CURSOR_INSERT_REQUEST = 230; + CURSOR_INSERT_REPLY = 231; + CURSOR_ERASE_REQUEST = 240; + CURSOR_ERASE_REPLY = 241; + CURSOR_GET_RECORD_COUNT_REQUEST = 260; + CURSOR_GET_RECORD_COUNT_REPLY = 261; + CURSOR_GET_DUPLICATE_POSITION_REQUEST 
= 262; + CURSOR_GET_DUPLICATE_POSITION_REPLY = 263; + CURSOR_GET_RECORD_SIZE_REQUEST = 264; + CURSOR_GET_RECORD_SIZE_REPLY = 265; + CURSOR_OVERWRITE_REQUEST = 270; + CURSOR_OVERWRITE_REPLY = 271; + CURSOR_MOVE_REQUEST = 280; + CURSOR_MOVE_REPLY = 281; + } + + required Type type = 1; + + optional ConnectRequest connect_request = 10; + optional ConnectReply connect_reply = 11; + optional DisconnectRequest disconnect_request = 12; + optional DisconnectReply disconnect_reply = 13; + optional EnvRenameRequest env_rename_request = 20; + optional EnvRenameReply env_rename_reply = 21; + optional EnvGetParametersRequest env_get_parameters_request = 30; + optional EnvGetParametersReply env_get_parameters_reply = 31; + optional EnvGetDatabaseNamesRequest env_get_database_names_request = 40; + optional EnvGetDatabaseNamesReply env_get_database_names_reply = 41; + optional EnvFlushRequest env_flush_request = 50; + optional EnvFlushReply env_flush_reply = 51; + optional EnvCreateDbRequest env_create_db_request = 60; + optional EnvCreateDbReply env_create_db_reply = 61; + optional EnvOpenDbRequest env_open_db_request = 70; + optional EnvOpenDbReply env_open_db_reply = 71; + optional EnvEraseDbRequest env_erase_db_request = 80; + optional EnvEraseDbReply env_erase_db_reply = 81; + optional DbCloseRequest db_close_request = 90; + optional DbCloseReply db_close_reply = 91; + optional DbGetParametersRequest db_get_parameters_request = 100; + optional DbGetParametersReply db_get_parameters_reply = 101; + optional TxnBeginRequest txn_begin_request = 120; + optional TxnBeginReply txn_begin_reply = 121; + optional TxnCommitRequest txn_commit_request = 130; + optional TxnCommitReply txn_commit_reply = 131; + optional TxnAbortRequest txn_abort_request = 140; + optional TxnAbortReply txn_abort_reply = 141; + optional DbCheckIntegrityRequest db_check_integrity_request = 150; + optional DbCheckIntegrityReply db_check_integrity_reply = 151; + optional DbCountRequest db_count_request = 160; + 
optional DbCountReply db_count_reply = 161; + optional DbInsertRequest db_insert_request = 170; + optional DbInsertReply db_insert_reply = 171; + optional DbEraseRequest db_erase_request = 180; + optional DbEraseReply db_erase_reply = 181; + optional DbFindRequest db_find_request = 190; + optional DbFindReply db_find_reply = 191; + optional CursorCreateRequest cursor_create_request = 200; + optional CursorCreateReply cursor_create_reply = 201; + optional CursorCloneRequest cursor_clone_request = 210; + optional CursorCloneReply cursor_clone_reply = 211; + optional CursorCloseRequest cursor_close_request = 220; + optional CursorCloseReply cursor_close_reply = 221; + optional CursorInsertRequest cursor_insert_request = 230; + optional CursorInsertReply cursor_insert_reply = 231; + optional CursorEraseRequest cursor_erase_request = 240; + optional CursorEraseReply cursor_erase_reply = 241; + optional CursorGetRecordCountRequest cursor_get_record_count_request = 260; + optional CursorGetRecordCountReply cursor_get_record_count_reply = 261; + optional CursorGetDuplicatePositionRequest cursor_get_duplicate_position_request = 262; + optional CursorGetDuplicatePositionReply cursor_get_duplicate_position_reply = 263; + optional CursorGetRecordSizeRequest cursor_get_record_size_request = 264; + optional CursorGetRecordSizeReply cursor_get_record_size_reply = 265; + optional CursorOverwriteRequest cursor_overwrite_request = 270; + optional CursorOverwriteReply cursor_overwrite_reply = 271; + optional CursorMoveRequest cursor_move_request = 280; + optional CursorMoveReply cursor_move_reply = 281; +} + +message ConnectRequest { + required string path = 1; +} + +message ConnectReply { + required sint32 status = 1; + optional uint32 env_flags = 2; + optional uint64 env_handle = 3; +} + +message DisconnectRequest { + required uint64 env_handle = 1; +} + +message DisconnectReply { + required sint32 status = 1; +} + +message EnvGetParametersRequest { + required uint64 env_handle = 
1; + repeated uint32 names = 2; +} + +message EnvGetParametersReply { + required sint32 status = 1; + optional uint32 cache_size = 2; + optional uint32 page_size = 3; + optional uint32 max_env_databases = 4; + optional uint32 flags = 5; + optional uint32 filemode = 6; + optional string filename = 7; +}; + +message EnvGetDatabaseNamesRequest { + required uint64 env_handle = 1; +} + +message EnvGetDatabaseNamesReply { + required sint32 status = 1; + repeated uint32 names = 2; +} + +message EnvRenameRequest { + required uint64 env_handle = 1; + required uint32 oldname = 2; + required uint32 newname = 3; + required uint32 flags = 4; +} + +message EnvRenameReply { + required sint32 status = 1; +}; + +message EnvFlushRequest { + required uint64 env_handle = 1; + required uint32 flags = 2; +} + +message EnvFlushReply { + required sint32 status = 1; +}; + +message EnvCreateDbRequest { + required uint64 env_handle = 1; + required uint32 dbname = 2; + required uint32 flags = 3; + repeated uint32 param_names = 4; + repeated uint64 param_values = 5; +} + +message EnvCreateDbReply { + required sint32 status = 1; + optional uint64 db_handle = 2; + optional uint32 db_flags = 3; +}; + +message EnvOpenDbRequest { + required uint64 env_handle = 1; + required uint32 dbname = 2; + required uint32 flags = 3; + repeated uint32 param_names = 4; + repeated uint64 param_values = 5; +} + +message EnvOpenDbReply { + required sint32 status = 1; + optional uint64 db_handle = 2; + optional uint32 db_flags = 3; +}; + +message EnvEraseDbRequest { + required uint64 env_handle = 1; + required uint32 name = 2; + required uint32 flags = 3; +} + +message EnvEraseDbReply { + required sint32 status = 1; +}; + +message DbCloseRequest { + required uint64 db_handle = 1; + required uint32 flags = 2; +} + +message DbCloseReply { + required sint32 status = 1; +}; + +message DbGetParametersRequest { + required uint64 db_handle = 1; + repeated uint32 names = 2; +} + +message DbGetParametersReply { + required 
sint32 status = 1; + optional uint32 max_env_databases = 2; + optional uint32 flags = 3; + optional uint32 key_size = 4; + optional uint32 dbname = 5; + optional uint32 keys_per_page = 6; + optional uint32 key_type = 7; + optional uint32 record_size = 8; +}; + +message TxnBeginRequest { + required uint64 env_handle = 1; + required uint32 flags = 2; + optional string name = 3; +} + +message TxnBeginReply { + required sint32 status = 1; + required uint64 txn_handle = 2; +}; + +message TxnCommitRequest { + required uint64 txn_handle = 1; + required uint32 flags = 2; +} + +message TxnCommitReply { + required sint32 status = 1; +}; + +message TxnAbortRequest { + required uint64 txn_handle = 1; + required uint32 flags = 2; +} + +message TxnAbortReply { + required sint32 status = 1; +}; + +message DbCheckIntegrityRequest { + required uint64 db_handle = 1; + required uint32 flags = 2; +} + +message DbCheckIntegrityReply { + required sint32 status = 1; +}; + +message DbCountRequest { + required uint64 db_handle = 1; + required uint64 txn_handle = 2; + required bool distinct = 3; +}; + +message DbCountReply { + required sint32 status = 1; + required uint64 keycount = 2; +}; + +message Key { + optional bytes data = 1; + required uint32 flags = 2; + required uint32 intflags = 3; +} + +message Record { + optional bytes data = 1; + required uint32 flags = 2; + required uint32 partial_offset = 3; + required uint32 partial_size = 4; +} + +message DbInsertRequest { + required uint64 db_handle = 1; + required uint64 txn_handle = 2; + optional Key key = 3; + optional Record record = 4; + required uint32 flags = 5; +}; + +message DbInsertReply { + required sint32 status = 1; + optional Key key = 2; +}; + +message DbEraseRequest { + required uint64 db_handle = 1; + required uint64 txn_handle = 2; + required Key key = 3; + required uint32 flags = 4; +}; + +message DbEraseReply { + required sint32 status = 1; +}; + +message DbFindRequest { + required uint64 db_handle = 1; + required 
uint64 txn_handle = 2; + required uint64 cursor_handle = 3; + required Key key = 4; + optional Record record = 5; + required uint32 flags = 6; +}; + +message DbFindReply { + required sint32 status = 1; + required Record record = 2; + optional Key key = 3; +}; + +message CursorCreateRequest { + required uint64 db_handle = 1; + required uint64 txn_handle = 2; + required uint32 flags = 3; +}; + +message CursorCreateReply { + required sint32 status = 1; + required uint64 cursor_handle = 2; +}; + +message CursorCloneRequest { + required uint64 cursor_handle = 1; +}; + +message CursorCloneReply { + required sint32 status = 1; + required uint64 cursor_handle = 2; +}; + +message CursorCloseRequest { + required uint64 cursor_handle = 1; +}; + +message CursorCloseReply { + required sint32 status = 1; +}; + +message CursorInsertRequest { + required uint64 cursor_handle = 1; + optional Key key = 2; + optional Record record = 3; + required uint32 flags = 4; +}; + +message CursorInsertReply { + required sint32 status = 1; + optional Key key = 2; +}; + +message CursorEraseRequest { + required uint64 cursor_handle = 1; + required uint32 flags = 2; +}; + +message CursorEraseReply { + required sint32 status = 1; +}; + +message CursorGetRecordCountRequest { + required uint64 cursor_handle = 1; + required uint32 flags = 2; +}; + +message CursorGetRecordCountReply { + required sint32 status = 1; + required uint32 count = 2; +}; + +message CursorGetRecordSizeRequest { + required uint64 cursor_handle = 1; +}; + +message CursorGetRecordSizeReply { + required sint32 status = 1; + required uint64 size = 2; +}; + +message CursorGetDuplicatePositionRequest { + required uint64 cursor_handle = 1; +}; + +message CursorGetDuplicatePositionReply { + required sint32 status = 1; + required uint32 position = 2; +}; + +message CursorOverwriteRequest { + required uint64 cursor_handle = 1; + required Record record = 2; + required uint32 flags = 3; +}; + +message CursorOverwriteReply { + required sint32 
status = 1; +}; + +message CursorMoveRequest { + required uint64 cursor_handle = 1; + optional Key key = 2; + optional Record record = 3; + required uint32 flags = 4; +}; + +message CursorMoveReply { + required sint32 status = 1; + optional Key key = 2; + optional Record record = 3; +}; diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2protobuf/protocol.h b/plugins/Dbx_kv/src/hamsterdb/src/2protobuf/protocol.h new file mode 100644 index 0000000000..8a2ab9d49f --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/2protobuf/protocol.h @@ -0,0 +1,147 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Abstraction layer for the remote protocol + * + * @exception_safe: no + * @thread_safe: no + */ + +#ifndef HAM_PROTOCOL_H +#define HAM_PROTOCOL_H + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! 
+#include "1mem/mem.h"
+#include "1base/error.h"
+#include "1base/dynamic_array.h"
+#include "2protobuf/messages.pb.h"
+
+#ifndef HAM_ROOT_H
+# error "root.h was not included"
+#endif
+
+using namespace hamsterdb;
+
+/** a magic and version indicator for the remote protocol */
+#define HAM_TRANSFER_MAGIC_V1 (('h'<<24)|('a'<<16)|('m'<<8)|'1')
+
+/**
+ * the Protocol class maps a single message that is exchanged between
+ * client and server
+ *
+ * On the wire every message consists of an 8 byte header (4 bytes magic,
+ * 4 bytes payload size; both written in host byte order) followed by the
+ * serialized protobuf payload.
+ */
+class Protocol : public hamsterdb::ProtoWrapper
+{
+  public:
+    /** default constructor - the message type is not set */
+    Protocol() { }
+
+    /** constructor - assigns a type */
+    Protocol(hamsterdb::ProtoWrapper_Type type) {
+      set_type(type);
+    }
+
+    /**
+     * helper function which copies a ham_key_t into a ProtoBuf key
+     *
+     * The key data is only copied if |deep_copy| is true; the public and
+     * the internal flags are always copied.
+     */
+    static void assign_key(hamsterdb::Key *protokey, ham_key_t *hamkey,
+            bool deep_copy = true) {
+      if (deep_copy)
+        protokey->set_data(hamkey->data, hamkey->size);
+      protokey->set_flags(hamkey->flags);
+      protokey->set_intflags(hamkey->_flags);
+    }
+
+    /**
+     * helper function which copies a ham_record_t into a ProtoBuf record
+     *
+     * The record data is only copied if |deep_copy| is true; the flags and
+     * the partial offset/size are always copied.
+     */
+    static void assign_record(hamsterdb::Record *protorec,
+            ham_record_t *hamrec, bool deep_copy = true) {
+      if (deep_copy)
+        protorec->set_data(hamrec->data, hamrec->size);
+      protorec->set_flags(hamrec->flags);
+      protorec->set_partial_offset(hamrec->partial_offset);
+      protorec->set_partial_size(hamrec->partial_size);
+    }
+
+    /**
+     * Factory function; creates a new Protocol structure from a serialized
+     * buffer
+     *
+     * The payload-size field of the header is not consulted; everything
+     * that follows the 8 byte header is handed to the protobuf parser.
+     *
+     * @param buf the serialized message, including the 8 byte header
+     * @param size the size of |buf|, in bytes
+     * @return a Protocol object allocated with new, or 0 if the buffer is
+     *      too short, the magic does not match or the payload cannot be
+     *      parsed
+     */
+    static Protocol *unpack(const uint8_t *buf, uint32_t size) {
+      /* a message always starts with the 8 byte header; without this check
+       * the magic is read out of bounds and |size - 8| underflows */
+      if (!buf || size < 8) {
+        ham_trace(("invalid protocol message size"));
+        return (0);
+      }
+
+      if (*(uint32_t *)&buf[0] != HAM_TRANSFER_MAGIC_V1) {
+        ham_trace(("invalid protocol version"));
+        return (0);
+      }
+
+      Protocol *p = new Protocol;
+      if (!p->ParseFromArray(buf + 8, size - 8)) {
+        delete p;
+        return (0);
+      }
+      return (p);
+    }
+
+    /**
+     * Packs the Protocol structure into a memory buffer and returns
+     * a pointer to the buffer and the buffer size
+     *
+     * @param data receives the buffer, which was allocated with
+     *      Memory::allocate; only assigned on success
+     * @param size receives the size of the buffer (header + payload); only
+     *      assigned on success
+     * @return false if the allocation or the serialization failed
+     */
+    bool pack(uint8_t **data, uint32_t *size) {
+      uint32_t packed_size = ByteSize();
+      /* we need 8 more bytes for magic and size */
+      uint8_t *p = Memory::allocate<uint8_t>(packed_size + 8);
+      if (!p)
+        return (false);
+
+      /* write the magic and the payload size of the packed structure */
+      *(uint32_t *)&p[0] = HAM_TRANSFER_MAGIC_V1;
+      *(uint32_t *)&p[4] = packed_size;
+
+      /* now write the packed structure */
+      if (!SerializeToArray(&p[8], packed_size)) {
+        /* do not leak the buffer if the serialization fails */
+        Memory::release(p);
+        return (false);
+      }
+
+      *data = p;
+      *size = packed_size + 8;
+      return (true);
+    }
+
+    /**
+     * Packs the Protocol structure into a ByteArray
+     *
+     * @param barray is resized to header + payload and then filled
+     * @return false if resizing or the serialization failed
+     */
+    bool pack(ByteArray *barray) {
+      uint32_t packed_size = ByteSize();
+      /* we need 8 more bytes for magic and size */
+      uint8_t *p = (uint8_t *)barray->resize(packed_size + 8);
+      if (!p)
+        return (false);
+
+      /* write the magic and the payload size of the packed structure */
+      *(uint32_t *)&p[0] = HAM_TRANSFER_MAGIC_V1;
+      *(uint32_t *)&p[4] = packed_size;
+
+      /* now write the packed structure */
+      return (SerializeToArray(&p[8], packed_size));
+    }
+
+    /**
+     * shutdown/free globally allocated memory
+     */
+    static void shutdown() {
+      google::protobuf::ShutdownProtobufLibrary();
+    }
+};
+
+#endif /* HAM_PROTOCOL_H */
diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2protoserde/Makefile.am b/plugins/Dbx_kv/src/hamsterdb/src/2protoserde/Makefile.am
new file mode 100644
index 0000000000..cf5a3fb3ec
--- /dev/null
+++ b/plugins/Dbx_kv/src/hamsterdb/src/2protoserde/Makefile.am
@@ -0,0 +1,5 @@
+
+EXTRA_DIST = messages.h messages.proto
+
+gen proto:
+	cat messages.proto | ../../bin/genserializer.pl > messages.h
diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2protoserde/Makefile.in b/plugins/Dbx_kv/src/hamsterdb/src/2protoserde/Makefile.in
new file mode 100644
index 0000000000..da966a01d3
--- /dev/null
+++ b/plugins/Dbx_kv/src/hamsterdb/src/2protoserde/Makefile.in
@@ -0,0 +1,451 @@
+# Makefile.in generated by automake 1.14.1 from Makefile.am.
+# @configure_input@
+
+# Copyright (C) 1994-2013 Free Software Foundation, Inc.
+ +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ +VPATH = @srcdir@ +am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)' +am__make_running_with_option = \ + case $${target_option-} in \ + ?) ;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = 
CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +subdir = src/2protoserde +DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/libtool.m4 \ + $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \ + $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \ + $(top_srcdir)/m4/ax_cxx_gcc_abi_demangle.m4 \ + $(top_srcdir)/m4/boost.m4 $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +SOURCES = +DIST_SOURCES = +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +BOOST_CHRONO_LDFLAGS = @BOOST_CHRONO_LDFLAGS@ +BOOST_CHRONO_LDPATH = @BOOST_CHRONO_LDPATH@ +BOOST_CHRONO_LIBS = @BOOST_CHRONO_LIBS@ +BOOST_CPPFLAGS = 
@BOOST_CPPFLAGS@ +BOOST_FILESYSTEM_LDFLAGS = @BOOST_FILESYSTEM_LDFLAGS@ +BOOST_FILESYSTEM_LDPATH = @BOOST_FILESYSTEM_LDPATH@ +BOOST_FILESYSTEM_LIBS = @BOOST_FILESYSTEM_LIBS@ +BOOST_LDPATH = @BOOST_LDPATH@ +BOOST_ROOT = @BOOST_ROOT@ +BOOST_SYSTEM_LDFLAGS = @BOOST_SYSTEM_LDFLAGS@ +BOOST_SYSTEM_LDPATH = @BOOST_SYSTEM_LDPATH@ +BOOST_SYSTEM_LIBS = @BOOST_SYSTEM_LIBS@ +BOOST_THREAD_LDFLAGS = @BOOST_THREAD_LDFLAGS@ +BOOST_THREAD_LDPATH = @BOOST_THREAD_LDPATH@ +BOOST_THREAD_LIBS = @BOOST_THREAD_LIBS@ +BOOST_THREAD_WIN32_LDFLAGS = @BOOST_THREAD_WIN32_LDFLAGS@ +BOOST_THREAD_WIN32_LDPATH = @BOOST_THREAD_WIN32_LDPATH@ +BOOST_THREAD_WIN32_LIBS = @BOOST_THREAD_WIN32_LIBS@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DISTCHECK_CONFIGURE_FLAGS = @DISTCHECK_CONFIGURE_FLAGS@ +DLLTOOL = @DLLTOOL@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +FGREP = @FGREP@ +GREP = @GREP@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +JDK_INCLUDE = @JDK_INCLUDE@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIPO = @LIPO@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MKDIR_P = @MKDIR_P@ +NM = @NM@ +NMEDIT = @NMEDIT@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +RANLIB = @RANLIB@ +SED = @SED@ +SET_MAKE = @SET_MAKE@ 
+SHELL = @SHELL@ +STRIP = @STRIP@ +VERSION = @VERSION@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build = @build@ +build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +EXTRA_DIST = messages.h messages.proto +all: all-am + +.SUFFIXES: +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign src/2protoserde/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --foreign src/2protoserde/Makefile +.PRECIOUS: Makefile +Makefile: 
$(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs +tags TAGS: + +ctags CTAGS: + +cscope cscopelist: + + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +all-am: Makefile +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-generic clean-libtool mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-generic + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-generic mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: all all-am check check-am clean clean-generic clean-libtool \ + cscopelist-am ctags-am distclean distclean-generic \ + distclean-libtool distdir dvi dvi-am html html-am info info-am \ + install install-am install-data install-data-am install-dvi \ + install-dvi-am install-exec install-exec-am install-html \ + install-html-am install-info install-info-am install-man \ + install-pdf install-pdf-am install-ps install-ps-am \ + install-strip installcheck installcheck-am installdirs \ + maintainer-clean maintainer-clean-generic mostlyclean \ + mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \ + tags-am uninstall uninstall-am + + +gen proto: + cat messages.proto | ../../bin/genserializer.pl > messages.h + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. 
+.NOEXPORT: diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2protoserde/messages.h b/plugins/Dbx_kv/src/hamsterdb/src/2protoserde/messages.h new file mode 100644 index 0000000000..38d091dd8f --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/2protoserde/messages.h @@ -0,0 +1,1839 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @exception_safe: nothrow + * @thread_safe: no + */ + +#ifndef HAM_MESSAGES_H +#define HAM_MESSAGES_H + +#include "0root/root.h" + +#include <assert.h> + +#include "ham/hamsterdb.h" + +// Always verify that a file of level N does not include headers > N!
+
+#ifndef HAM_ROOT_H
+# error "root.h was not included"
+#endif
+
+/** a magic and version indicator for the remote protocol */
+#define HAM_TRANSFER_MAGIC_V2 (('h'<<24)|('a'<<16)|('m'<<8)|'2')
+
+namespace hamsterdb {
+
+/** the message identifiers; one request/reply pair per remote operation */
+enum {
+  kTxnBeginRequest,
+  kTxnBeginReply,
+  kTxnCommitRequest,
+  kTxnCommitReply,
+  kTxnAbortRequest,
+  kTxnAbortReply,
+  kDbGetKeyCountRequest,
+  kDbGetKeyCountReply,
+  kDbInsertRequest,
+  kDbInsertReply,
+  kDbEraseRequest,
+  kDbEraseReply,
+  kDbFindRequest,
+  kDbFindReply,
+  kCursorCreateRequest,
+  kCursorCreateReply,
+  kCursorCloneRequest,
+  kCursorCloneReply,
+  kCursorCloseRequest,
+  kCursorCloseReply,
+  kCursorInsertRequest,
+  kCursorInsertReply,
+  kCursorEraseRequest,
+  kCursorEraseReply,
+  kCursorGetRecordCountRequest,
+  kCursorGetRecordCountReply,
+  kCursorGetRecordSizeRequest,
+  kCursorGetRecordSizeReply,
+  kCursorGetDuplicatePositionRequest,
+  kCursorGetDuplicatePositionReply,
+  kCursorOverwriteRequest,
+  kCursorOverwriteReply,
+  kCursorMoveRequest,
+  kCursorMoveReply
+};
+
+/**
+ * A scalar which is held in memory as |Ex| but transferred as |In|
+ *
+ * serialize()/deserialize() advance |*pptr| by sizeof(In) and reduce
+ * |*psize| (the number of bytes left in the buffer) by the same amount.
+ */
+template<typename Ex, typename In>
+struct Serialized_Base {
+  Ex value;
+
+  Serialized_Base() {
+    clear();
+  }
+
+  Serialized_Base(const Ex &t)
+    : value((In)t) {
+  }
+
+  operator Ex() {
+    return (value);
+  }
+
+  void clear() {
+    value = (Ex)0;
+  }
+
+  /** the serialized size, in bytes */
+  size_t get_size() const {
+    return (sizeof(In));
+  }
+
+  void serialize(unsigned char **pptr, int *psize) const {
+    *(In *)*pptr = (In)value;
+    *pptr += sizeof(In);
+    *psize -= sizeof(In);
+    assert(*psize >= 0);
+  }
+
+  void deserialize(unsigned char **pptr, int *psize) {
+    value = (Ex) *(In *)*pptr;
+    *pptr += sizeof(In);
+    *psize -= sizeof(In);
+    assert(*psize >= 0);
+  }
+};
+
+/**
+ * A byte sequence; transferred as a 32bit length followed by the data,
+ * which is padded to a multiple of 4 bytes
+ *
+ * After deserialize() |value| points into the buffer that was
+ * deserialized; the data is not copied.
+ */
+struct SerializedBytes {
+  uint8_t *value;
+  uint32_t size;
+
+  SerializedBytes() {
+    clear();
+  }
+
+  /** rounds |s| up to the next multiple of 4 */
+  size_t align(size_t s) const {
+    if (s % 4) return (s + 4 - (s % 4));
+    return (s);
+  }
+
+  void clear() {
+    value = 0; size = 0;
+  }
+
+  /** the serialized size (length prefix + padded data), in bytes */
+  size_t get_size() const {
+    return (sizeof(uint32_t) + align(size)); // align to 32bits
+  }
+
+  /**
+   * Writes the length prefix and the padded data to |*pptr|
+   *
+   * @param pptr the write position; advanced by get_size() bytes
+   * @param psize the number of bytes left in the buffer; reduced by
+   *      get_size()
+   */
+  void serialize(unsigned char **pptr, int *psize) const {
+    *(uint32_t *)*pptr = size;
+    *pptr += sizeof(uint32_t);
+    *psize -= sizeof(uint32_t);
+    if (size) {
+      size_t padded = align(size); // align to 32bits
+      memcpy(*pptr, value, size);
+      /* zero the padding; otherwise up to 3 uninitialized bytes of the
+       * buffer are sent to the peer */
+      if (padded > size)
+        memset(*pptr + size, 0, padded - size);
+      *pptr += padded;
+      *psize -= padded;
+    }
+    /* also verifies the length prefix if the sequence is empty */
+    assert(*psize >= 0);
+  }
+
+  /**
+   * Reads the length prefix and lets |value| point to the data
+   *
+   * The length comes from the peer and is verified against the bytes
+   * that are left before it is used. If the buffer is too short then the
+   * object is cleared and |*psize| is set to -1 (a negative |*psize| is
+   * what the assertions in this file treat as an overrun).
+   *
+   * @param pptr the read position; advanced by get_size() bytes
+   * @param psize the number of bytes left in the buffer; reduced by
+   *      get_size()
+   */
+  void deserialize(unsigned char **pptr, int *psize) {
+    if (*psize < (int)sizeof(uint32_t)) {
+      assert(!"SerializedBytes: buffer too short for the length prefix");
+      clear();
+      *psize = -1;
+      return;
+    }
+    size = *(uint32_t *)*pptr;
+    *pptr += sizeof(uint32_t);
+    *psize -= sizeof(uint32_t);
+    if (size) {
+      size_t remaining = (size_t)*psize;
+      size_t padded = align(size); // align to 32bits
+      /* compare as size_t: subtracting a huge length from the int wraps
+       * around and can leave |*psize| positive; |size| is tested as well
+       * because align() can overflow if size_t has 32 bits */
+      if (size > remaining || padded > remaining) {
+        assert(!"SerializedBytes: data exceeds the buffer");
+        clear();
+        *psize = -1;
+        return;
+      }
+      value = *pptr;
+      *pptr += padded;
+      *psize -= (int)padded;
+    }
+    else
+      value = 0;
+  }
+};
+
+/* all scalars smaller than 32 bits are widened to 32 bits on the wire */
+typedef Serialized_Base<bool, uint32_t> SerializedBool;
+typedef Serialized_Base<uint8_t, uint32_t> SerializedUint8;
+typedef Serialized_Base<uint16_t, uint32_t> SerializedUint16;
+typedef Serialized_Base<uint32_t, uint32_t> SerializedUint32;
+typedef Serialized_Base<int8_t, int32_t> SerializedSint8;
+typedef Serialized_Base<int16_t, int32_t> SerializedSint16;
+typedef Serialized_Base<int32_t, int32_t> SerializedSint32;
+typedef Serialized_Base<uint64_t, uint64_t> SerializedUint64;
+typedef Serialized_Base<int64_t, int64_t> SerializedSint64;
+
+
+struct SerializedKey {
+  SerializedBool has_data;
+  SerializedBytes data;
+  SerializedUint32 flags;
+  SerializedUint32 intflags;
+
+  SerializedKey() {
+    clear();
+  }
+
+  size_t get_size() const {
+    return (
+          has_data.get_size() +
+          (has_data.value ? 
data.get_size() : 0) + + flags.get_size() + + intflags.get_size() + + 0); + } + + void clear() { + has_data = false; + data.clear(); + flags.clear(); + intflags.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + has_data.serialize(pptr, psize); + if (has_data.value) data.serialize(pptr, psize); + flags.serialize(pptr, psize); + intflags.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + has_data.deserialize(pptr, psize); + if (has_data.value) data.deserialize(pptr, psize); + flags.deserialize(pptr, psize); + intflags.deserialize(pptr, psize); + } +}; + +struct SerializedRecord { + SerializedBool has_data; + SerializedBytes data; + SerializedUint32 flags; + SerializedUint32 partial_offset; + SerializedUint32 partial_size; + + SerializedRecord() { + clear(); + } + + size_t get_size() const { + return ( + has_data.get_size() + + (has_data.value ? data.get_size() : 0) + + flags.get_size() + + partial_offset.get_size() + + partial_size.get_size() + + 0); + } + + void clear() { + has_data = false; + data.clear(); + flags.clear(); + partial_offset.clear(); + partial_size.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + has_data.serialize(pptr, psize); + if (has_data.value) data.serialize(pptr, psize); + flags.serialize(pptr, psize); + partial_offset.serialize(pptr, psize); + partial_size.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + has_data.deserialize(pptr, psize); + if (has_data.value) data.deserialize(pptr, psize); + flags.deserialize(pptr, psize); + partial_offset.deserialize(pptr, psize); + partial_size.deserialize(pptr, psize); + } +}; + +struct SerializedConnectRequest { + SerializedBytes path; + + SerializedConnectRequest() { + clear(); + } + + size_t get_size() const { + return ( + path.get_size() + + 0); + } + + void clear() { + path.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + path.serialize(pptr, 
psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + path.deserialize(pptr, psize); + } +}; + +struct SerializedConnectReply { + SerializedSint32 status; + SerializedUint32 env_flags; + SerializedUint64 env_handle; + + SerializedConnectReply() { + clear(); + } + + size_t get_size() const { + return ( + status.get_size() + + env_flags.get_size() + + env_handle.get_size() + + 0); + } + + void clear() { + status.clear(); + env_flags.clear(); + env_handle.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + status.serialize(pptr, psize); + env_flags.serialize(pptr, psize); + env_handle.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + status.deserialize(pptr, psize); + env_flags.deserialize(pptr, psize); + env_handle.deserialize(pptr, psize); + } +}; + +struct SerializedTxnBeginRequest { + SerializedUint64 env_handle; + SerializedUint32 flags; + SerializedBytes name; + + SerializedTxnBeginRequest() { + clear(); + } + + size_t get_size() const { + return ( + env_handle.get_size() + + flags.get_size() + + name.get_size() + + 0); + } + + void clear() { + env_handle.clear(); + flags.clear(); + name.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + env_handle.serialize(pptr, psize); + flags.serialize(pptr, psize); + name.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + env_handle.deserialize(pptr, psize); + flags.deserialize(pptr, psize); + name.deserialize(pptr, psize); + } +}; + +struct SerializedTxnBeginReply { + SerializedSint32 status; + SerializedUint64 txn_handle; + + SerializedTxnBeginReply() { + clear(); + } + + size_t get_size() const { + return ( + status.get_size() + + txn_handle.get_size() + + 0); + } + + void clear() { + status.clear(); + txn_handle.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + status.serialize(pptr, psize); + txn_handle.serialize(pptr, psize); + } + + void 
deserialize(unsigned char **pptr, int *psize) { + status.deserialize(pptr, psize); + txn_handle.deserialize(pptr, psize); + } +}; + +struct SerializedTxnCommitRequest { + SerializedUint64 txn_handle; + SerializedUint32 flags; + + SerializedTxnCommitRequest() { + clear(); + } + + size_t get_size() const { + return ( + txn_handle.get_size() + + flags.get_size() + + 0); + } + + void clear() { + txn_handle.clear(); + flags.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + txn_handle.serialize(pptr, psize); + flags.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + txn_handle.deserialize(pptr, psize); + flags.deserialize(pptr, psize); + } +}; + +struct SerializedTxnCommitReply { + SerializedSint32 status; + + SerializedTxnCommitReply() { + clear(); + } + + size_t get_size() const { + return ( + status.get_size() + + 0); + } + + void clear() { + status.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + status.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + status.deserialize(pptr, psize); + } +}; + +struct SerializedTxnAbortRequest { + SerializedUint64 txn_handle; + SerializedUint32 flags; + + SerializedTxnAbortRequest() { + clear(); + } + + size_t get_size() const { + return ( + txn_handle.get_size() + + flags.get_size() + + 0); + } + + void clear() { + txn_handle.clear(); + flags.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + txn_handle.serialize(pptr, psize); + flags.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + txn_handle.deserialize(pptr, psize); + flags.deserialize(pptr, psize); + } +}; + +struct SerializedTxnAbortReply { + SerializedSint32 status; + + SerializedTxnAbortReply() { + clear(); + } + + size_t get_size() const { + return ( + status.get_size() + + 0); + } + + void clear() { + status.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + 
status.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + status.deserialize(pptr, psize); + } +}; + +struct SerializedDbGetKeyCountRequest { + SerializedUint64 db_handle; + SerializedUint64 txn_handle; + SerializedBool distinct; + + SerializedDbGetKeyCountRequest() { + clear(); + } + + size_t get_size() const { + return ( + db_handle.get_size() + + txn_handle.get_size() + + distinct.get_size() + + 0); + } + + void clear() { + db_handle.clear(); + txn_handle.clear(); + distinct.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + db_handle.serialize(pptr, psize); + txn_handle.serialize(pptr, psize); + distinct.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + db_handle.deserialize(pptr, psize); + txn_handle.deserialize(pptr, psize); + distinct.deserialize(pptr, psize); + } +}; + +struct SerializedDbGetKeyCountReply { + SerializedSint32 status; + SerializedUint64 keycount; + + SerializedDbGetKeyCountReply() { + clear(); + } + + size_t get_size() const { + return ( + status.get_size() + + keycount.get_size() + + 0); + } + + void clear() { + status.clear(); + keycount.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + status.serialize(pptr, psize); + keycount.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + status.deserialize(pptr, psize); + keycount.deserialize(pptr, psize); + } +}; + +struct SerializedDbInsertRequest { + SerializedUint64 db_handle; + SerializedUint64 txn_handle; + SerializedUint32 flags; + SerializedBool has_key; + SerializedKey key; + SerializedBool has_record; + SerializedRecord record; + + SerializedDbInsertRequest() { + clear(); + } + + size_t get_size() const { + return ( + db_handle.get_size() + + txn_handle.get_size() + + flags.get_size() + + has_key.get_size() + + (has_key.value ? key.get_size() : 0) + + has_record.get_size() + + (has_record.value ? 
record.get_size() : 0) + + 0); + } + + void clear() { + db_handle.clear(); + txn_handle.clear(); + flags.clear(); + has_key = false; + key.clear(); + has_record = false; + record.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + db_handle.serialize(pptr, psize); + txn_handle.serialize(pptr, psize); + flags.serialize(pptr, psize); + has_key.serialize(pptr, psize); + if (has_key.value) key.serialize(pptr, psize); + has_record.serialize(pptr, psize); + if (has_record.value) record.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + db_handle.deserialize(pptr, psize); + txn_handle.deserialize(pptr, psize); + flags.deserialize(pptr, psize); + has_key.deserialize(pptr, psize); + if (has_key.value) key.deserialize(pptr, psize); + has_record.deserialize(pptr, psize); + if (has_record.value) record.deserialize(pptr, psize); + } +}; + +struct SerializedDbInsertReply { + SerializedSint32 status; + SerializedBool has_key; + SerializedKey key; + + SerializedDbInsertReply() { + clear(); + } + + size_t get_size() const { + return ( + status.get_size() + + has_key.get_size() + + (has_key.value ? 
key.get_size() : 0) + + 0); + } + + void clear() { + status.clear(); + has_key = false; + key.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + status.serialize(pptr, psize); + has_key.serialize(pptr, psize); + if (has_key.value) key.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + status.deserialize(pptr, psize); + has_key.deserialize(pptr, psize); + if (has_key.value) key.deserialize(pptr, psize); + } +}; + +struct SerializedDbEraseRequest { + SerializedUint64 db_handle; + SerializedUint64 txn_handle; + SerializedKey key; + SerializedUint32 flags; + + SerializedDbEraseRequest() { + clear(); + } + + size_t get_size() const { + return ( + db_handle.get_size() + + txn_handle.get_size() + + key.get_size() + + flags.get_size() + + 0); + } + + void clear() { + db_handle.clear(); + txn_handle.clear(); + key.clear(); + flags.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + db_handle.serialize(pptr, psize); + txn_handle.serialize(pptr, psize); + key.serialize(pptr, psize); + flags.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + db_handle.deserialize(pptr, psize); + txn_handle.deserialize(pptr, psize); + key.deserialize(pptr, psize); + flags.deserialize(pptr, psize); + } +}; + +struct SerializedDbEraseReply { + SerializedSint32 status; + + SerializedDbEraseReply() { + clear(); + } + + size_t get_size() const { + return ( + status.get_size() + + 0); + } + + void clear() { + status.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + status.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + status.deserialize(pptr, psize); + } +}; + +struct SerializedDbFindRequest { + SerializedUint64 db_handle; + SerializedUint64 txn_handle; + SerializedUint64 cursor_handle; + SerializedUint32 flags; + SerializedKey key; + SerializedBool has_record; + SerializedRecord record; + + SerializedDbFindRequest() 
{ + clear(); + } + + size_t get_size() const { + return ( + db_handle.get_size() + + txn_handle.get_size() + + cursor_handle.get_size() + + flags.get_size() + + key.get_size() + + has_record.get_size() + + (has_record.value ? record.get_size() : 0) + + 0); + } + + void clear() { + db_handle.clear(); + txn_handle.clear(); + cursor_handle.clear(); + flags.clear(); + key.clear(); + has_record = false; + record.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + db_handle.serialize(pptr, psize); + txn_handle.serialize(pptr, psize); + cursor_handle.serialize(pptr, psize); + flags.serialize(pptr, psize); + key.serialize(pptr, psize); + has_record.serialize(pptr, psize); + if (has_record.value) record.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + db_handle.deserialize(pptr, psize); + txn_handle.deserialize(pptr, psize); + cursor_handle.deserialize(pptr, psize); + flags.deserialize(pptr, psize); + key.deserialize(pptr, psize); + has_record.deserialize(pptr, psize); + if (has_record.value) record.deserialize(pptr, psize); + } +}; + +struct SerializedDbFindReply { + SerializedSint32 status; + SerializedBool has_key; + SerializedKey key; + SerializedBool has_record; + SerializedRecord record; + + SerializedDbFindReply() { + clear(); + } + + size_t get_size() const { + return ( + status.get_size() + + has_key.get_size() + + (has_key.value ? key.get_size() : 0) + + has_record.get_size() + + (has_record.value ? 
record.get_size() : 0) + + 0); + } + + void clear() { + status.clear(); + has_key = false; + key.clear(); + has_record = false; + record.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + status.serialize(pptr, psize); + has_key.serialize(pptr, psize); + if (has_key.value) key.serialize(pptr, psize); + has_record.serialize(pptr, psize); + if (has_record.value) record.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + status.deserialize(pptr, psize); + has_key.deserialize(pptr, psize); + if (has_key.value) key.deserialize(pptr, psize); + has_record.deserialize(pptr, psize); + if (has_record.value) record.deserialize(pptr, psize); + } +}; + +struct SerializedCursorCreateRequest { + SerializedUint64 db_handle; + SerializedUint64 txn_handle; + SerializedUint32 flags; + + SerializedCursorCreateRequest() { + clear(); + } + + size_t get_size() const { + return ( + db_handle.get_size() + + txn_handle.get_size() + + flags.get_size() + + 0); + } + + void clear() { + db_handle.clear(); + txn_handle.clear(); + flags.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + db_handle.serialize(pptr, psize); + txn_handle.serialize(pptr, psize); + flags.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + db_handle.deserialize(pptr, psize); + txn_handle.deserialize(pptr, psize); + flags.deserialize(pptr, psize); + } +}; + +struct SerializedCursorCreateReply { + SerializedSint32 status; + SerializedUint64 cursor_handle; + + SerializedCursorCreateReply() { + clear(); + } + + size_t get_size() const { + return ( + status.get_size() + + cursor_handle.get_size() + + 0); + } + + void clear() { + status.clear(); + cursor_handle.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + status.serialize(pptr, psize); + cursor_handle.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + status.deserialize(pptr, psize); + 
cursor_handle.deserialize(pptr, psize); + } +}; + +struct SerializedCursorCloneRequest { + SerializedUint64 cursor_handle; + + SerializedCursorCloneRequest() { + clear(); + } + + size_t get_size() const { + return ( + cursor_handle.get_size() + + 0); + } + + void clear() { + cursor_handle.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + cursor_handle.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + cursor_handle.deserialize(pptr, psize); + } +}; + +struct SerializedCursorCloneReply { + SerializedSint32 status; + SerializedUint64 cursor_handle; + + SerializedCursorCloneReply() { + clear(); + } + + size_t get_size() const { + return ( + status.get_size() + + cursor_handle.get_size() + + 0); + } + + void clear() { + status.clear(); + cursor_handle.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + status.serialize(pptr, psize); + cursor_handle.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + status.deserialize(pptr, psize); + cursor_handle.deserialize(pptr, psize); + } +}; + +struct SerializedCursorCloseRequest { + SerializedUint64 cursor_handle; + + SerializedCursorCloseRequest() { + clear(); + } + + size_t get_size() const { + return ( + cursor_handle.get_size() + + 0); + } + + void clear() { + cursor_handle.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + cursor_handle.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + cursor_handle.deserialize(pptr, psize); + } +}; + +struct SerializedCursorCloseReply { + SerializedSint32 status; + + SerializedCursorCloseReply() { + clear(); + } + + size_t get_size() const { + return ( + status.get_size() + + 0); + } + + void clear() { + status.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + status.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + status.deserialize(pptr, psize); 
+ } +}; + +struct SerializedCursorInsertRequest { + SerializedUint64 cursor_handle; + SerializedUint32 flags; + SerializedBool has_key; + SerializedKey key; + SerializedBool has_record; + SerializedRecord record; + + SerializedCursorInsertRequest() { + clear(); + } + + size_t get_size() const { + return ( + cursor_handle.get_size() + + flags.get_size() + + has_key.get_size() + + (has_key.value ? key.get_size() : 0) + + has_record.get_size() + + (has_record.value ? record.get_size() : 0) + + 0); + } + + void clear() { + cursor_handle.clear(); + flags.clear(); + has_key = false; + key.clear(); + has_record = false; + record.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + cursor_handle.serialize(pptr, psize); + flags.serialize(pptr, psize); + has_key.serialize(pptr, psize); + if (has_key.value) key.serialize(pptr, psize); + has_record.serialize(pptr, psize); + if (has_record.value) record.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + cursor_handle.deserialize(pptr, psize); + flags.deserialize(pptr, psize); + has_key.deserialize(pptr, psize); + if (has_key.value) key.deserialize(pptr, psize); + has_record.deserialize(pptr, psize); + if (has_record.value) record.deserialize(pptr, psize); + } +}; + +struct SerializedCursorInsertReply { + SerializedSint32 status; + SerializedBool has_key; + SerializedKey key; + + SerializedCursorInsertReply() { + clear(); + } + + size_t get_size() const { + return ( + status.get_size() + + has_key.get_size() + + (has_key.value ? 
key.get_size() : 0) + + 0); + } + + void clear() { + status.clear(); + has_key = false; + key.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + status.serialize(pptr, psize); + has_key.serialize(pptr, psize); + if (has_key.value) key.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + status.deserialize(pptr, psize); + has_key.deserialize(pptr, psize); + if (has_key.value) key.deserialize(pptr, psize); + } +}; + +struct SerializedCursorEraseRequest { + SerializedUint64 cursor_handle; + SerializedUint32 flags; + + SerializedCursorEraseRequest() { + clear(); + } + + size_t get_size() const { + return ( + cursor_handle.get_size() + + flags.get_size() + + 0); + } + + void clear() { + cursor_handle.clear(); + flags.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + cursor_handle.serialize(pptr, psize); + flags.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + cursor_handle.deserialize(pptr, psize); + flags.deserialize(pptr, psize); + } +}; + +struct SerializedCursorEraseReply { + SerializedSint32 status; + + SerializedCursorEraseReply() { + clear(); + } + + size_t get_size() const { + return ( + status.get_size() + + 0); + } + + void clear() { + status.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + status.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + status.deserialize(pptr, psize); + } +}; + +struct SerializedCursorGetRecordCountRequest { + SerializedUint64 cursor_handle; + SerializedUint32 flags; + + SerializedCursorGetRecordCountRequest() { + clear(); + } + + size_t get_size() const { + return ( + cursor_handle.get_size() + + flags.get_size() + + 0); + } + + void clear() { + cursor_handle.clear(); + flags.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + cursor_handle.serialize(pptr, psize); + flags.serialize(pptr, psize); + } + + void 
deserialize(unsigned char **pptr, int *psize) { + cursor_handle.deserialize(pptr, psize); + flags.deserialize(pptr, psize); + } +}; + +struct SerializedCursorGetRecordCountReply { + SerializedSint32 status; + SerializedUint32 count; + + SerializedCursorGetRecordCountReply() { + clear(); + } + + size_t get_size() const { + return ( + status.get_size() + + count.get_size() + + 0); + } + + void clear() { + status.clear(); + count.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + status.serialize(pptr, psize); + count.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + status.deserialize(pptr, psize); + count.deserialize(pptr, psize); + } +}; + +struct SerializedCursorGetRecordSizeRequest { + SerializedUint64 cursor_handle; + + SerializedCursorGetRecordSizeRequest() { + clear(); + } + + size_t get_size() const { + return ( + cursor_handle.get_size() + + 0); + } + + void clear() { + cursor_handle.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + cursor_handle.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + cursor_handle.deserialize(pptr, psize); + } +}; + +struct SerializedCursorGetRecordSizeReply { + SerializedSint32 status; + SerializedUint64 size; + + SerializedCursorGetRecordSizeReply() { + clear(); + } + + size_t get_size() const { + return ( + status.get_size() + + size.get_size() + + 0); + } + + void clear() { + status.clear(); + size.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + status.serialize(pptr, psize); + size.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + status.deserialize(pptr, psize); + size.deserialize(pptr, psize); + } +}; + +struct SerializedCursorGetDuplicatePositionRequest { + SerializedUint64 cursor_handle; + + SerializedCursorGetDuplicatePositionRequest() { + clear(); + } + + size_t get_size() const { + return ( + cursor_handle.get_size() + + 0); + } + 
+ void clear() { + cursor_handle.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + cursor_handle.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + cursor_handle.deserialize(pptr, psize); + } +}; + +struct SerializedCursorGetDuplicatePositionReply { + SerializedSint32 status; + SerializedUint32 position; + + SerializedCursorGetDuplicatePositionReply() { + clear(); + } + + size_t get_size() const { + return ( + status.get_size() + + position.get_size() + + 0); + } + + void clear() { + status.clear(); + position.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + status.serialize(pptr, psize); + position.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + status.deserialize(pptr, psize); + position.deserialize(pptr, psize); + } +}; + +struct SerializedCursorOverwriteRequest { + SerializedUint64 cursor_handle; + SerializedRecord record; + SerializedUint32 flags; + + SerializedCursorOverwriteRequest() { + clear(); + } + + size_t get_size() const { + return ( + cursor_handle.get_size() + + record.get_size() + + flags.get_size() + + 0); + } + + void clear() { + cursor_handle.clear(); + record.clear(); + flags.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + cursor_handle.serialize(pptr, psize); + record.serialize(pptr, psize); + flags.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + cursor_handle.deserialize(pptr, psize); + record.deserialize(pptr, psize); + flags.deserialize(pptr, psize); + } +}; + +struct SerializedCursorOverwriteReply { + SerializedSint32 status; + + SerializedCursorOverwriteReply() { + clear(); + } + + size_t get_size() const { + return ( + status.get_size() + + 0); + } + + void clear() { + status.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + status.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + 
status.deserialize(pptr, psize); + } +}; + +struct SerializedCursorMoveRequest { + SerializedUint64 cursor_handle; + SerializedUint32 flags; + SerializedBool has_key; + SerializedKey key; + SerializedBool has_record; + SerializedRecord record; + + SerializedCursorMoveRequest() { + clear(); + } + + size_t get_size() const { + return ( + cursor_handle.get_size() + + flags.get_size() + + has_key.get_size() + + (has_key.value ? key.get_size() : 0) + + has_record.get_size() + + (has_record.value ? record.get_size() : 0) + + 0); + } + + void clear() { + cursor_handle.clear(); + flags.clear(); + has_key = false; + key.clear(); + has_record = false; + record.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + cursor_handle.serialize(pptr, psize); + flags.serialize(pptr, psize); + has_key.serialize(pptr, psize); + if (has_key.value) key.serialize(pptr, psize); + has_record.serialize(pptr, psize); + if (has_record.value) record.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + cursor_handle.deserialize(pptr, psize); + flags.deserialize(pptr, psize); + has_key.deserialize(pptr, psize); + if (has_key.value) key.deserialize(pptr, psize); + has_record.deserialize(pptr, psize); + if (has_record.value) record.deserialize(pptr, psize); + } +}; + +struct SerializedCursorMoveReply { + SerializedSint32 status; + SerializedKey key; + SerializedRecord record; + + SerializedCursorMoveReply() { + clear(); + } + + size_t get_size() const { + return ( + status.get_size() + + key.get_size() + + record.get_size() + + 0); + } + + void clear() { + status.clear(); + key.clear(); + record.clear(); + } + + void serialize(unsigned char **pptr, int *psize) const { + status.serialize(pptr, psize); + key.serialize(pptr, psize); + record.serialize(pptr, psize); + } + + void deserialize(unsigned char **pptr, int *psize) { + status.deserialize(pptr, psize); + key.deserialize(pptr, psize); + record.deserialize(pptr, psize); + } +}; + +struct 
SerializedWrapper { + SerializedUint32 magic; + SerializedUint32 size; + SerializedUint32 id; + SerializedTxnBeginRequest txn_begin_request; + SerializedTxnBeginReply txn_begin_reply; + SerializedTxnCommitRequest txn_commit_request; + SerializedTxnCommitReply txn_commit_reply; + SerializedTxnAbortRequest txn_abort_request; + SerializedTxnAbortReply txn_abort_reply; + SerializedDbGetKeyCountRequest db_count_request; + SerializedDbGetKeyCountReply db_count_reply; + SerializedDbInsertRequest db_insert_request; + SerializedDbInsertReply db_insert_reply; + SerializedDbEraseRequest db_erase_request; + SerializedDbEraseReply db_erase_reply; + SerializedDbFindRequest db_find_request; + SerializedDbFindReply db_find_reply; + SerializedCursorCreateRequest cursor_create_request; + SerializedCursorCreateReply cursor_create_reply; + SerializedCursorCloneRequest cursor_clone_request; + SerializedCursorCloneReply cursor_clone_reply; + SerializedCursorCloseRequest cursor_close_request; + SerializedCursorCloseReply cursor_close_reply; + SerializedCursorInsertRequest cursor_insert_request; + SerializedCursorInsertReply cursor_insert_reply; + SerializedCursorEraseRequest cursor_erase_request; + SerializedCursorEraseReply cursor_erase_reply; + SerializedCursorGetRecordCountRequest cursor_get_record_count_request; + SerializedCursorGetRecordCountReply cursor_get_record_count_reply; + SerializedCursorGetRecordSizeRequest cursor_get_record_size_request; + SerializedCursorGetRecordSizeReply cursor_get_record_size_reply; + SerializedCursorGetDuplicatePositionRequest cursor_get_duplicate_position_request; + SerializedCursorGetDuplicatePositionReply cursor_get_duplicate_position_reply; + SerializedCursorOverwriteRequest cursor_overwrite_request; + SerializedCursorOverwriteReply cursor_overwrite_reply; + SerializedCursorMoveRequest cursor_move_request; + SerializedCursorMoveReply cursor_move_reply; + + SerializedWrapper() { + clear(); + } + + // the methods in here have a custom 
implementation, otherwise we would + // generate many bools for the "optional" fields, and they would + // unnecessarily increase the structure size + void clear() { + magic = 0; + size = 0; + id = 0; + } + + size_t get_size() const { + size_t s = magic.get_size() + size.get_size() + id.get_size(); + switch (id.value) { + case kTxnBeginRequest: + return (s + txn_begin_request.get_size()); + case kTxnBeginReply: + return (s + txn_begin_reply.get_size()); + case kTxnCommitRequest: + return (s + txn_commit_request.get_size()); + case kTxnCommitReply: + return (s + txn_commit_reply.get_size()); + case kTxnAbortRequest: + return (s + txn_abort_request.get_size()); + case kTxnAbortReply: + return (s + txn_abort_reply.get_size()); + case kDbGetKeyCountRequest: + return (s + db_count_request.get_size()); + case kDbGetKeyCountReply: + return (s + db_count_reply.get_size()); + case kDbInsertRequest: + return (s + db_insert_request.get_size()); + case kDbInsertReply: + return (s + db_insert_reply.get_size()); + case kDbEraseRequest: + return (s + db_erase_request.get_size()); + case kDbEraseReply: + return (s + db_erase_reply.get_size()); + case kDbFindRequest: + return (s + db_find_request.get_size()); + case kDbFindReply: + return (s + db_find_reply.get_size()); + case kCursorCreateRequest: + return (s + cursor_create_request.get_size()); + case kCursorCreateReply: + return (s + cursor_create_reply.get_size()); + case kCursorCloneRequest: + return (s + cursor_clone_request.get_size()); + case kCursorCloneReply: + return (s + cursor_clone_reply.get_size()); + case kCursorCloseRequest: + return (s + cursor_close_request.get_size()); + case kCursorCloseReply: + return (s + cursor_close_reply.get_size()); + case kCursorInsertRequest: + return (s + cursor_insert_request.get_size()); + case kCursorInsertReply: + return (s + cursor_insert_reply.get_size()); + case kCursorEraseRequest: + return (s + cursor_erase_request.get_size()); + case kCursorEraseReply: + return (s + 
cursor_erase_reply.get_size()); + case kCursorGetRecordCountRequest: + return (s + cursor_get_record_count_request.get_size()); + case kCursorGetRecordCountReply: + return (s + cursor_get_record_count_reply.get_size()); + case kCursorGetRecordSizeRequest: + return (s + cursor_get_record_size_request.get_size()); + case kCursorGetRecordSizeReply: + return (s + cursor_get_record_size_reply.get_size()); + case kCursorGetDuplicatePositionRequest: + return (s + cursor_get_duplicate_position_request.get_size()); + case kCursorGetDuplicatePositionReply: + return (s + cursor_get_duplicate_position_reply.get_size()); + case kCursorOverwriteRequest: + return (s + cursor_overwrite_request.get_size()); + case kCursorOverwriteReply: + return (s + cursor_overwrite_reply.get_size()); + case kCursorMoveRequest: + return (s + cursor_move_request.get_size()); + case kCursorMoveReply: + return (s + cursor_move_reply.get_size()); + default: + assert(!"shouldn't be here"); + return (0); + } + } + + void serialize(unsigned char **pptr, int *psize) const { + magic.serialize(pptr, psize); + size.serialize(pptr, psize); + id.serialize(pptr, psize); + + switch (id.value) { + case kTxnBeginRequest: + txn_begin_request.serialize(pptr, psize); + break; + case kTxnBeginReply: + txn_begin_reply.serialize(pptr, psize); + break; + case kTxnCommitRequest: + txn_commit_request.serialize(pptr, psize); + break; + case kTxnCommitReply: + txn_commit_reply.serialize(pptr, psize); + break; + case kTxnAbortRequest: + txn_abort_request.serialize(pptr, psize); + break; + case kTxnAbortReply: + txn_abort_reply.serialize(pptr, psize); + break; + case kDbGetKeyCountRequest: + db_count_request.serialize(pptr, psize); + break; + case kDbGetKeyCountReply: + db_count_reply.serialize(pptr, psize); + break; + case kDbInsertRequest: + db_insert_request.serialize(pptr, psize); + break; + case kDbInsertReply: + db_insert_reply.serialize(pptr, psize); + break; + case kDbEraseRequest: + db_erase_request.serialize(pptr, 
psize); + break; + case kDbEraseReply: + db_erase_reply.serialize(pptr, psize); + break; + case kDbFindRequest: + db_find_request.serialize(pptr, psize); + break; + case kDbFindReply: + db_find_reply.serialize(pptr, psize); + break; + case kCursorCreateRequest: + cursor_create_request.serialize(pptr, psize); + break; + case kCursorCreateReply: + cursor_create_reply.serialize(pptr, psize); + break; + case kCursorCloneRequest: + cursor_clone_request.serialize(pptr, psize); + break; + case kCursorCloneReply: + cursor_clone_reply.serialize(pptr, psize); + break; + case kCursorCloseRequest: + cursor_close_request.serialize(pptr, psize); + break; + case kCursorCloseReply: + cursor_close_reply.serialize(pptr, psize); + break; + case kCursorInsertRequest: + cursor_insert_request.serialize(pptr, psize); + break; + case kCursorInsertReply: + cursor_insert_reply.serialize(pptr, psize); + break; + case kCursorEraseRequest: + cursor_erase_request.serialize(pptr, psize); + break; + case kCursorEraseReply: + cursor_erase_reply.serialize(pptr, psize); + break; + case kCursorGetRecordCountRequest: + cursor_get_record_count_request.serialize(pptr, psize); + break; + case kCursorGetRecordCountReply: + cursor_get_record_count_reply.serialize(pptr, psize); + break; + case kCursorGetRecordSizeRequest: + cursor_get_record_size_request.serialize(pptr, psize); + break; + case kCursorGetRecordSizeReply: + cursor_get_record_size_reply.serialize(pptr, psize); + break; + case kCursorGetDuplicatePositionRequest: + cursor_get_duplicate_position_request.serialize(pptr, psize); + break; + case kCursorGetDuplicatePositionReply: + cursor_get_duplicate_position_reply.serialize(pptr, psize); + break; + case kCursorOverwriteRequest: + cursor_overwrite_request.serialize(pptr, psize); + break; + case kCursorOverwriteReply: + cursor_overwrite_reply.serialize(pptr, psize); + break; + case kCursorMoveRequest: + cursor_move_request.serialize(pptr, psize); + break; + case kCursorMoveReply: + 
cursor_move_reply.serialize(pptr, psize); + break; + default: + assert(!"shouldn't be here"); + } + } + + void deserialize(unsigned char **pptr, int *psize) { + magic.deserialize(pptr, psize); + size.deserialize(pptr, psize); + id.deserialize(pptr, psize); + + switch (id.value) { + case kTxnBeginRequest: + txn_begin_request.deserialize(pptr, psize); + break; + case kTxnBeginReply: + txn_begin_reply.deserialize(pptr, psize); + break; + case kTxnCommitRequest: + txn_commit_request.deserialize(pptr, psize); + break; + case kTxnCommitReply: + txn_commit_reply.deserialize(pptr, psize); + break; + case kTxnAbortRequest: + txn_abort_request.deserialize(pptr, psize); + break; + case kTxnAbortReply: + txn_abort_reply.deserialize(pptr, psize); + break; + case kDbGetKeyCountRequest: + db_count_request.deserialize(pptr, psize); + break; + case kDbGetKeyCountReply: + db_count_reply.deserialize(pptr, psize); + break; + case kDbInsertRequest: + db_insert_request.deserialize(pptr, psize); + break; + case kDbInsertReply: + db_insert_reply.deserialize(pptr, psize); + break; + case kDbEraseRequest: + db_erase_request.deserialize(pptr, psize); + break; + case kDbEraseReply: + db_erase_reply.deserialize(pptr, psize); + break; + case kDbFindRequest: + db_find_request.deserialize(pptr, psize); + break; + case kDbFindReply: + db_find_reply.deserialize(pptr, psize); + break; + case kCursorCreateRequest: + cursor_create_request.deserialize(pptr, psize); + break; + case kCursorCreateReply: + cursor_create_reply.deserialize(pptr, psize); + break; + case kCursorCloneRequest: + cursor_clone_request.deserialize(pptr, psize); + break; + case kCursorCloneReply: + cursor_clone_reply.deserialize(pptr, psize); + break; + case kCursorCloseRequest: + cursor_close_request.deserialize(pptr, psize); + break; + case kCursorCloseReply: + cursor_close_reply.deserialize(pptr, psize); + break; + case kCursorInsertRequest: + cursor_insert_request.deserialize(pptr, psize); + break; + case kCursorInsertReply: + 
cursor_insert_reply.deserialize(pptr, psize); + break; + case kCursorEraseRequest: + cursor_erase_request.deserialize(pptr, psize); + break; + case kCursorEraseReply: + cursor_erase_reply.deserialize(pptr, psize); + break; + case kCursorGetRecordCountRequest: + cursor_get_record_count_request.deserialize(pptr, psize); + break; + case kCursorGetRecordCountReply: + cursor_get_record_count_reply.deserialize(pptr, psize); + break; + case kCursorGetRecordSizeRequest: + cursor_get_record_size_request.deserialize(pptr, psize); + break; + case kCursorGetRecordSizeReply: + cursor_get_record_size_reply.deserialize(pptr, psize); + break; + case kCursorGetDuplicatePositionRequest: + cursor_get_duplicate_position_request.deserialize(pptr, psize); + break; + case kCursorGetDuplicatePositionReply: + cursor_get_duplicate_position_reply.deserialize(pptr, psize); + break; + case kCursorOverwriteRequest: + cursor_overwrite_request.deserialize(pptr, psize); + break; + case kCursorOverwriteReply: + cursor_overwrite_reply.serialize(pptr, psize); + break; + case kCursorMoveRequest: + cursor_move_request.deserialize(pptr, psize); + break; + case kCursorMoveReply: + cursor_move_reply.deserialize(pptr, psize); + break; + default: + assert(!"shouldn't be here"); + } + } +}; + + +} // namespace hamsterdb +#endif // HAM_MESSAGES_H + diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2protoserde/messages.proto b/plugins/Dbx_kv/src/hamsterdb/src/2protoserde/messages.proto new file mode 100644 index 0000000000..cbd68bf655 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/2protoserde/messages.proto @@ -0,0 +1,646 @@ +SET_OPTION(prefix, Serialized) + +PROLOGUE_BEGIN +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @exception_safe: nothrow + * @thread_safe: no + */ + +#ifndef HAM_MESSAGES_H +#define HAM_MESSAGES_H + +#include "0root/root.h" + +#include <assert.h> + +#include "ham/hamsterdb.h" + +// Always verify that a file of level N does not include headers > N! + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +/** a magic and version indicator for the remote protocol */ +#define HAM_TRANSFER_MAGIC_V2 (('h'<<24)|('a'<<16)|('m'<<8)|'2') + +namespace hamsterdb { + +enum { + kTxnBeginRequest, + kTxnBeginReply, + kTxnCommitRequest, + kTxnCommitReply, + kTxnAbortRequest, + kTxnAbortReply, + kDbGetKeyCountRequest, + kDbGetKeyCountReply, + kDbInsertRequest, + kDbInsertReply, + kDbEraseRequest, + kDbEraseReply, + kDbFindRequest, + kDbFindReply, + kCursorCreateRequest, + kCursorCreateReply, + kCursorCloneRequest, + kCursorCloneReply, + kCursorCloseRequest, + kCursorCloseReply, + kCursorInsertRequest, + kCursorInsertReply, + kCursorEraseRequest, + kCursorEraseReply, + kCursorGetRecordCountRequest, + kCursorGetRecordCountReply, + kCursorGetRecordSizeRequest, + kCursorGetRecordSizeReply, + kCursorGetDuplicatePositionRequest, + kCursorGetDuplicatePositionReply, + kCursorOverwriteRequest, + kCursorOverwriteReply, + kCursorMoveRequest, + kCursorMoveReply +}; + +PROLOGUE_END + +MESSAGE_BEGIN(Key) + optional bytes data; + uint32 flags; + uint32 intflags; +MESSAGE_END + +MESSAGE_BEGIN(Record) + optional bytes data; + uint32 flags; + uint32 partial_offset; + uint32 partial_size; +MESSAGE_END + +MESSAGE_BEGIN(ConnectRequest) + bytes path; 
+MESSAGE_END + +MESSAGE_BEGIN(ConnectReply) + sint32 status; + uint32 env_flags; + uint64 env_handle; +MESSAGE_END + +MESSAGE_BEGIN(TxnBeginRequest) + uint64 env_handle; + uint32 flags; + bytes name; +MESSAGE_END + +MESSAGE_BEGIN(TxnBeginReply) + sint32 status; + uint64 txn_handle; +MESSAGE_END + +MESSAGE_BEGIN(TxnCommitRequest) + uint64 txn_handle; + uint32 flags; +MESSAGE_END + +MESSAGE_BEGIN(TxnCommitReply) + sint32 status; +MESSAGE_END + +MESSAGE_BEGIN(TxnAbortRequest) + uint64 txn_handle; + uint32 flags; +MESSAGE_END + +MESSAGE_BEGIN(TxnAbortReply) + sint32 status; +MESSAGE_END + +MESSAGE_BEGIN(DbGetKeyCountRequest) + uint64 db_handle; + uint64 txn_handle; + bool distinct; +MESSAGE_END + +MESSAGE_BEGIN(DbGetKeyCountReply) + sint32 status; + uint64 keycount; +MESSAGE_END + +MESSAGE_BEGIN(DbInsertRequest) + uint64 db_handle; + uint64 txn_handle; + uint32 flags; + optional Key key; + optional Record record; +MESSAGE_END + +MESSAGE_BEGIN(DbInsertReply) + sint32 status; + optional Key key; +MESSAGE_END + +MESSAGE_BEGIN(DbEraseRequest) + uint64 db_handle; + uint64 txn_handle; + Key key; + uint32 flags; +MESSAGE_END + +MESSAGE_BEGIN(DbEraseReply) + sint32 status; +MESSAGE_END + +MESSAGE_BEGIN(DbFindRequest) + uint64 db_handle; + uint64 txn_handle; + uint64 cursor_handle; + uint32 flags; + Key key; + optional Record record; +MESSAGE_END + +MESSAGE_BEGIN(DbFindReply) + sint32 status; + optional Key key; + optional Record record; +MESSAGE_END + +MESSAGE_BEGIN(CursorCreateRequest) + uint64 db_handle; + uint64 txn_handle; + uint32 flags; +MESSAGE_END + +MESSAGE_BEGIN(CursorCreateReply) + sint32 status; + uint64 cursor_handle; +MESSAGE_END + +MESSAGE_BEGIN(CursorCloneRequest) + uint64 cursor_handle; +MESSAGE_END + +MESSAGE_BEGIN(CursorCloneReply) + sint32 status; + uint64 cursor_handle; +MESSAGE_END + +MESSAGE_BEGIN(CursorCloseRequest) + uint64 cursor_handle; +MESSAGE_END + +MESSAGE_BEGIN(CursorCloseReply) + sint32 status; +MESSAGE_END + +MESSAGE_BEGIN(CursorInsertRequest) 
+ uint64 cursor_handle; + uint32 flags; + optional Key key; + optional Record record; +MESSAGE_END + +MESSAGE_BEGIN(CursorInsertReply) + sint32 status; + optional Key key; +MESSAGE_END + +MESSAGE_BEGIN(CursorEraseRequest) + uint64 cursor_handle; + uint32 flags; +MESSAGE_END + +MESSAGE_BEGIN(CursorEraseReply) + sint32 status; +MESSAGE_END + +MESSAGE_BEGIN(CursorGetRecordCountRequest) + uint64 cursor_handle; + uint32 flags; +MESSAGE_END + +MESSAGE_BEGIN(CursorGetRecordCountReply) + sint32 status; + uint32 count; +MESSAGE_END + +MESSAGE_BEGIN(CursorGetRecordSizeRequest) + uint64 cursor_handle; +MESSAGE_END + +MESSAGE_BEGIN(CursorGetRecordSizeReply) + sint32 status; + uint64 size; +MESSAGE_END + +MESSAGE_BEGIN(CursorGetDuplicatePositionRequest) + uint64 cursor_handle; +MESSAGE_END + +MESSAGE_BEGIN(CursorGetDuplicatePositionReply) + sint32 status; + uint32 position; +MESSAGE_END + +MESSAGE_BEGIN(CursorOverwriteRequest) + uint64 cursor_handle; + Record record; + uint32 flags; +MESSAGE_END + +MESSAGE_BEGIN(CursorOverwriteReply) + sint32 status; +MESSAGE_END + +MESSAGE_BEGIN(CursorMoveRequest) + uint64 cursor_handle; + uint32 flags; + optional Key key; + optional Record record; +MESSAGE_END + +MESSAGE_BEGIN(CursorMoveReply) + sint32 status; + Key key; + Record record; +MESSAGE_END + +MESSAGE_BEGIN(Wrapper) + uint32 magic; + uint32 size; + uint32 id; + TxnBeginRequest txn_begin_request; + TxnBeginReply txn_begin_reply; + TxnCommitRequest txn_commit_request; + TxnCommitReply txn_commit_reply; + TxnAbortRequest txn_abort_request; + TxnAbortReply txn_abort_reply; + DbGetKeyCountRequest db_count_request; + DbGetKeyCountReply db_count_reply; + DbInsertRequest db_insert_request; + DbInsertReply db_insert_reply; + DbEraseRequest db_erase_request; + DbEraseReply db_erase_reply; + DbFindRequest db_find_request; + DbFindReply db_find_reply; + CursorCreateRequest cursor_create_request; + CursorCreateReply cursor_create_reply; + CursorCloneRequest cursor_clone_request; + 
CursorCloneReply cursor_clone_reply; + CursorCloseRequest cursor_close_request; + CursorCloseReply cursor_close_reply; + CursorInsertRequest cursor_insert_request; + CursorInsertReply cursor_insert_reply; + CursorEraseRequest cursor_erase_request; + CursorEraseReply cursor_erase_reply; + CursorGetRecordCountRequest cursor_get_record_count_request; + CursorGetRecordCountReply cursor_get_record_count_reply; + CursorGetRecordSizeRequest cursor_get_record_size_request; + CursorGetRecordSizeReply cursor_get_record_size_reply; + CursorGetDuplicatePositionRequest cursor_get_duplicate_position_request; + CursorGetDuplicatePositionReply cursor_get_duplicate_position_reply; + CursorOverwriteRequest cursor_overwrite_request; + CursorOverwriteReply cursor_overwrite_reply; + CursorMoveRequest cursor_move_request; + CursorMoveReply cursor_move_reply; + + CUSTOM_IMPLEMENTATION_BEGIN + // the methods in here have a custom implementation, otherwise we would + // generate many bools for the "optional" fields, and they would + // unnecessarily increase the structure size + void clear() { + magic = 0; + size = 0; + id = 0; + } + + size_t get_size() const { + size_t s = magic.get_size() + size.get_size() + id.get_size(); + switch (id.value) { + case kTxnBeginRequest: + return (s + txn_begin_request.get_size()); + case kTxnBeginReply: + return (s + txn_begin_reply.get_size()); + case kTxnCommitRequest: + return (s + txn_commit_request.get_size()); + case kTxnCommitReply: + return (s + txn_commit_reply.get_size()); + case kTxnAbortRequest: + return (s + txn_abort_request.get_size()); + case kTxnAbortReply: + return (s + txn_abort_reply.get_size()); + case kDbGetKeyCountRequest: + return (s + db_count_request.get_size()); + case kDbGetKeyCountReply: + return (s + db_count_reply.get_size()); + case kDbInsertRequest: + return (s + db_insert_request.get_size()); + case kDbInsertReply: + return (s + db_insert_reply.get_size()); + case kDbEraseRequest: + return (s + 
db_erase_request.get_size()); + case kDbEraseReply: + return (s + db_erase_reply.get_size()); + case kDbFindRequest: + return (s + db_find_request.get_size()); + case kDbFindReply: + return (s + db_find_reply.get_size()); + case kCursorCreateRequest: + return (s + cursor_create_request.get_size()); + case kCursorCreateReply: + return (s + cursor_create_reply.get_size()); + case kCursorCloneRequest: + return (s + cursor_clone_request.get_size()); + case kCursorCloneReply: + return (s + cursor_clone_reply.get_size()); + case kCursorCloseRequest: + return (s + cursor_close_request.get_size()); + case kCursorCloseReply: + return (s + cursor_close_reply.get_size()); + case kCursorInsertRequest: + return (s + cursor_insert_request.get_size()); + case kCursorInsertReply: + return (s + cursor_insert_reply.get_size()); + case kCursorEraseRequest: + return (s + cursor_erase_request.get_size()); + case kCursorEraseReply: + return (s + cursor_erase_reply.get_size()); + case kCursorGetRecordCountRequest: + return (s + cursor_get_record_count_request.get_size()); + case kCursorGetRecordCountReply: + return (s + cursor_get_record_count_reply.get_size()); + case kCursorGetRecordSizeRequest: + return (s + cursor_get_record_size_request.get_size()); + case kCursorGetRecordSizeReply: + return (s + cursor_get_record_size_reply.get_size()); + case kCursorGetDuplicatePositionRequest: + return (s + cursor_get_duplicate_position_request.get_size()); + case kCursorGetDuplicatePositionReply: + return (s + cursor_get_duplicate_position_reply.get_size()); + case kCursorOverwriteRequest: + return (s + cursor_overwrite_request.get_size()); + case kCursorOverwriteReply: + return (s + cursor_overwrite_reply.get_size()); + case kCursorMoveRequest: + return (s + cursor_move_request.get_size()); + case kCursorMoveReply: + return (s + cursor_move_reply.get_size()); + default: + assert(!"shouldn't be here"); + return (0); + } + } + + void serialize(unsigned char **pptr, int *psize) const { + 
magic.serialize(pptr, psize); + size.serialize(pptr, psize); + id.serialize(pptr, psize); + + switch (id.value) { + case kTxnBeginRequest: + txn_begin_request.serialize(pptr, psize); + break; + case kTxnBeginReply: + txn_begin_reply.serialize(pptr, psize); + break; + case kTxnCommitRequest: + txn_commit_request.serialize(pptr, psize); + break; + case kTxnCommitReply: + txn_commit_reply.serialize(pptr, psize); + break; + case kTxnAbortRequest: + txn_abort_request.serialize(pptr, psize); + break; + case kTxnAbortReply: + txn_abort_reply.serialize(pptr, psize); + break; + case kDbGetKeyCountRequest: + db_count_request.serialize(pptr, psize); + break; + case kDbGetKeyCountReply: + db_count_reply.serialize(pptr, psize); + break; + case kDbInsertRequest: + db_insert_request.serialize(pptr, psize); + break; + case kDbInsertReply: + db_insert_reply.serialize(pptr, psize); + break; + case kDbEraseRequest: + db_erase_request.serialize(pptr, psize); + break; + case kDbEraseReply: + db_erase_reply.serialize(pptr, psize); + break; + case kDbFindRequest: + db_find_request.serialize(pptr, psize); + break; + case kDbFindReply: + db_find_reply.serialize(pptr, psize); + break; + case kCursorCreateRequest: + cursor_create_request.serialize(pptr, psize); + break; + case kCursorCreateReply: + cursor_create_reply.serialize(pptr, psize); + break; + case kCursorCloneRequest: + cursor_clone_request.serialize(pptr, psize); + break; + case kCursorCloneReply: + cursor_clone_reply.serialize(pptr, psize); + break; + case kCursorCloseRequest: + cursor_close_request.serialize(pptr, psize); + break; + case kCursorCloseReply: + cursor_close_reply.serialize(pptr, psize); + break; + case kCursorInsertRequest: + cursor_insert_request.serialize(pptr, psize); + break; + case kCursorInsertReply: + cursor_insert_reply.serialize(pptr, psize); + break; + case kCursorEraseRequest: + cursor_erase_request.serialize(pptr, psize); + break; + case kCursorEraseReply: + cursor_erase_reply.serialize(pptr, psize); + 
break; + case kCursorGetRecordCountRequest: + cursor_get_record_count_request.serialize(pptr, psize); + break; + case kCursorGetRecordCountReply: + cursor_get_record_count_reply.serialize(pptr, psize); + break; + case kCursorGetRecordSizeRequest: + cursor_get_record_size_request.serialize(pptr, psize); + break; + case kCursorGetRecordSizeReply: + cursor_get_record_size_reply.serialize(pptr, psize); + break; + case kCursorGetDuplicatePositionRequest: + cursor_get_duplicate_position_request.serialize(pptr, psize); + break; + case kCursorGetDuplicatePositionReply: + cursor_get_duplicate_position_reply.serialize(pptr, psize); + break; + case kCursorOverwriteRequest: + cursor_overwrite_request.serialize(pptr, psize); + break; + case kCursorOverwriteReply: + cursor_overwrite_reply.serialize(pptr, psize); + break; + case kCursorMoveRequest: + cursor_move_request.serialize(pptr, psize); + break; + case kCursorMoveReply: + cursor_move_reply.serialize(pptr, psize); + break; + default: + assert(!"shouldn't be here"); + } + } + + void deserialize(unsigned char **pptr, int *psize) { + magic.deserialize(pptr, psize); + size.deserialize(pptr, psize); + id.deserialize(pptr, psize); + + switch (id.value) { + case kTxnBeginRequest: + txn_begin_request.deserialize(pptr, psize); + break; + case kTxnBeginReply: + txn_begin_reply.deserialize(pptr, psize); + break; + case kTxnCommitRequest: + txn_commit_request.deserialize(pptr, psize); + break; + case kTxnCommitReply: + txn_commit_reply.deserialize(pptr, psize); + break; + case kTxnAbortRequest: + txn_abort_request.deserialize(pptr, psize); + break; + case kTxnAbortReply: + txn_abort_reply.deserialize(pptr, psize); + break; + case kDbGetKeyCountRequest: + db_count_request.deserialize(pptr, psize); + break; + case kDbGetKeyCountReply: + db_count_reply.deserialize(pptr, psize); + break; + case kDbInsertRequest: + db_insert_request.deserialize(pptr, psize); + break; + case kDbInsertReply: + db_insert_reply.deserialize(pptr, psize); + 
break; + case kDbEraseRequest: + db_erase_request.deserialize(pptr, psize); + break; + case kDbEraseReply: + db_erase_reply.deserialize(pptr, psize); + break; + case kDbFindRequest: + db_find_request.deserialize(pptr, psize); + break; + case kDbFindReply: + db_find_reply.deserialize(pptr, psize); + break; + case kCursorCreateRequest: + cursor_create_request.deserialize(pptr, psize); + break; + case kCursorCreateReply: + cursor_create_reply.deserialize(pptr, psize); + break; + case kCursorCloneRequest: + cursor_clone_request.deserialize(pptr, psize); + break; + case kCursorCloneReply: + cursor_clone_reply.deserialize(pptr, psize); + break; + case kCursorCloseRequest: + cursor_close_request.deserialize(pptr, psize); + break; + case kCursorCloseReply: + cursor_close_reply.deserialize(pptr, psize); + break; + case kCursorInsertRequest: + cursor_insert_request.deserialize(pptr, psize); + break; + case kCursorInsertReply: + cursor_insert_reply.deserialize(pptr, psize); + break; + case kCursorEraseRequest: + cursor_erase_request.deserialize(pptr, psize); + break; + case kCursorEraseReply: + cursor_erase_reply.deserialize(pptr, psize); + break; + case kCursorGetRecordCountRequest: + cursor_get_record_count_request.deserialize(pptr, psize); + break; + case kCursorGetRecordCountReply: + cursor_get_record_count_reply.deserialize(pptr, psize); + break; + case kCursorGetRecordSizeRequest: + cursor_get_record_size_request.deserialize(pptr, psize); + break; + case kCursorGetRecordSizeReply: + cursor_get_record_size_reply.deserialize(pptr, psize); + break; + case kCursorGetDuplicatePositionRequest: + cursor_get_duplicate_position_request.deserialize(pptr, psize); + break; + case kCursorGetDuplicatePositionReply: + cursor_get_duplicate_position_reply.deserialize(pptr, psize); + break; + case kCursorOverwriteRequest: + cursor_overwrite_request.deserialize(pptr, psize); + break; + case kCursorOverwriteReply: + cursor_overwrite_reply.deserialize(pptr, psize); + break; + case 
kCursorMoveRequest: + cursor_move_request.deserialize(pptr, psize); + break; + case kCursorMoveReply: + cursor_move_reply.deserialize(pptr, psize); + break; + default: + assert(!"shouldn't be here"); + } + } + CUSTOM_IMPLEMENTATION_END +MESSAGE_END + + +EPILOGUE_BEGIN + +} // namespace hamsterdb +#endif // HAM_MESSAGES_H + +EPILOGUE_END diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2queue/queue.h b/plugins/Dbx_kv/src/hamsterdb/src/2queue/queue.h new file mode 100644 index 0000000000..a45d45dfa2 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/2queue/queue.h @@ -0,0 +1,131 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * A thread-safe message queue. Producers can insert at the front, Consumers + * pick messages from the tail. + * + * The queue uses a Spinlock for synchronization, but locks it only very, + * very briefly. + */ + +#ifndef HAM_QUEUE_H +#define HAM_QUEUE_H + +#include "0root/root.h" + +#include <ham/types.h> + +// Always verify that a file of level N does not include headers > N! +#include "1base/spinlock.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +// The Message. Other messages can derive from it and append their own +// payload. 
+struct MessageBase +{ + // Message flags + enum { + // Message is mandatory and must not be skipped + kIsMandatory = 0 + }; + + MessageBase(int type_, int flags_) + : type(type_), flags(flags_), previous(0), next(0) { + } + + virtual ~MessageBase() { + } + + int type; + int flags; + MessageBase *previous; + MessageBase *next; +}; + + +class Queue +{ + public: + template<typename T> + struct Message : public MessageBase + { + Message(int type, int flags) + : MessageBase(type, flags) { + } + + T payload; + }; + + Queue() + : m_head(0), m_tail(0) { + } + + // Pushes a |message| object to the queue + void push(MessageBase *message) { + ScopedSpinlock lock(m_mutex); + if (!m_tail) { + ham_assert(m_head == 0); + m_head = m_tail = message; + } + else if (m_tail == m_head) { + m_tail->previous = message; + message->next = m_tail; + m_head = message; + } + else { + message->next = m_head; + m_head->previous = message; + m_head = message; + } + } + + // Pops a message from the tail of the queue. Returns null if the queue + // is empty. + MessageBase *pop() { + ScopedSpinlock lock(m_mutex); + if (!m_tail) { + ham_assert(m_head == 0); + return (0); + } + + MessageBase *msg = m_tail; + if (m_tail == m_head) + m_head = m_tail = 0; + else + m_tail = m_tail->previous; + return (msg); + } + + private: + // For synchronization + Spinlock m_mutex; + + // The head of the linked list (and newest MessageBase) + MessageBase *m_head; + + // The tail of the linked list (and oldest MessageBase) + MessageBase *m_tail; +}; + +} // namespace hamsterdb + +#endif // HAM_QUEUE_H diff --git a/plugins/Dbx_kv/src/hamsterdb/src/2worker/worker.h b/plugins/Dbx_kv/src/hamsterdb/src/2worker/worker.h new file mode 100644 index 0000000000..2f6798b32c --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/2worker/worker.h @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * The worker thread. Asynchronously purges the cache. Thread will start as + * soon as it's constructed. + */ + +#ifndef HAM_WORKER_H +#define HAM_WORKER_H + +#include "0root/root.h" + +#include <boost/thread.hpp> + +// Always verify that a file of level N does not include headers > N! +#include "2queue/queue.h" +#include "4env/env_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class Worker +{ + public: + Worker() + : m_stop_requested(false), m_thread(&Worker::run, this) { + } + + void add_to_queue(MessageBase *message) { + m_queue.push(message); + + ScopedLock lock(m_mutex); + m_cond.notify_one(); + } + + void stop_and_join() { + { + ScopedLock lock(m_mutex); + m_stop_requested = true; + m_cond.notify_one(); + } + m_thread.join(); + } + + private: + // The thread function + void run() { + while (true) { + MessageBase *message = 0; + { + ScopedLock lock(m_mutex); + if (m_stop_requested) + return; + message = m_queue.pop(); + if (!message) { + m_cond.wait(lock); // will unlock m_mutex while waiting + message = m_queue.pop(); + } + } + + if (message) { + handle_message(message); + delete message; + } + } + } + + // The message handler - has to be overridden + virtual void handle_message(MessageBase *message) = 0; + + // A queue for storing messages + Queue m_queue; + + // true if the Environment is closed + bool m_stop_requested; + + // A mutex 
for protecting |m_cond| + boost::mutex m_mutex; + + // A condition to wait for + boost::condition_variable m_cond; + + // The actual thread + boost::thread m_thread; +}; + +} // namespace hamsterdb + +#endif // HAM_WORKER_H diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager.cc b/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager.cc new file mode 100644 index 0000000000..d0c075cdec --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager.cc @@ -0,0 +1,85 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "blob_manager.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +using namespace hamsterdb; + +uint64_t +BlobManager::allocate(Context *context, ham_record_t *record, + uint32_t flags) +{ + // PARTIAL WRITE + // + // if offset+partial_size equals the full record size, then we won't + // have any gaps. In this case we just write the full record and ignore + // the partial parameters. 
+ if (flags & HAM_PARTIAL) { + if (record->partial_offset == 0 && record->partial_size == record->size) + flags &= ~HAM_PARTIAL; + } + + m_metric_total_allocated++; + + return (do_allocate(context, record, flags)); +} + +void +BlobManager::read(Context *context, uint64_t blobid, ham_record_t *record, + uint32_t flags, ByteArray *arena) +{ + m_metric_total_read++; + + return (do_read(context, blobid, record, flags, arena)); +} + +uint64_t +BlobManager::overwrite(Context *context, uint64_t old_blobid, + ham_record_t *record, uint32_t flags) +{ + // PARTIAL WRITE + // + // if offset+partial_size equals the full record size, then we won't + // have any gaps. In this case we just write the full record and ignore + // the partial parameters. + if (flags & HAM_PARTIAL) { + if (record->partial_offset == 0 && record->partial_size == record->size) + flags &= ~HAM_PARTIAL; + } + + return (do_overwrite(context, old_blobid, record, flags)); +} + +uint64_t +BlobManager::get_blob_size(Context *context, uint64_t blob_id) +{ + return (do_get_blob_size(context, blob_id)); +} + +void +BlobManager::erase(Context *context, uint64_t blob_id, Page *page, + uint32_t flags) +{ + return (do_erase(context, blob_id, page, flags)); +} + diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager.h b/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager.h new file mode 100644 index 0000000000..208345e2ed --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager.h @@ -0,0 +1,231 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @brief functions for reading/writing/allocating blobs (memory chunks of + * arbitrary size) + * + */ + +#ifndef HAM_BLOB_MANAGER_H +#define HAM_BLOB_MANAGER_H + +#include "0root/root.h" + +#include "ham/hamsterdb_int.h" + +// Always verify that a file of level N does not include headers > N! +#include "1base/dynamic_array.h" +#include "2page/page.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +struct Context; +class LocalEnvironment; + +#include "1base/packstart.h" + +// A blob header structure +// +// This header is prepended to the blob's payload. It holds the blob size and +// the blob's address (which is not required but useful for error checking.) 
+HAM_PACK_0 class HAM_PACK_1 PBlobHeader +{ + public: + PBlobHeader() { + memset(this, 0, sizeof(PBlobHeader)); + } + + // Returns a PBlobHeader from a file address + static PBlobHeader *from_page(Page *page, uint64_t address) { + uint32_t readstart = (uint32_t)(address - page->get_address()); + return (PBlobHeader *)&page->get_raw_payload()[readstart]; + } + + // Returns the blob flags + uint32_t get_flags() const { + return (m_flags); + } + + // Sets the blob's flags + void set_flags(uint32_t flags) { + m_flags = flags; + } + + // Returns the absolute address of the blob + uint64_t get_self() const { + return (m_blobid); + } + + // Sets the absolute address of the blob + void set_self(uint64_t id) { + m_blobid = id; + } + + // Returns the payload size of the blob + uint64_t get_size() const { + return (m_size); + } + + // Sets the payload size of the blob + void set_size(uint64_t size) { + m_size = size; + } + + // Returns the allocated size of the blob (includes padding) + uint64_t get_alloc_size() const { + return (m_allocated_size); + } + + // Sets the allocated size of a blob (includes padding) + void set_alloc_size(uint64_t size) { + m_allocated_size = size; + } + + private: + // Flags; currently only used in hamsterdb-pro to store compression + // information + uint32_t m_flags; + + // The blob ID - which is the absolute address/offset of this + //* structure in the file + uint64_t m_blobid; + + // The allocated size of the blob; this is the size, which is used + // by the blob and it's header and maybe additional padding + uint64_t m_allocated_size; + + // The "real" size of the blob (excluding the header) + uint64_t m_size; +} HAM_PACK_2; + +#include "1base/packstop.h" + +// The BlobManager manages blobs (not a surprise) +// +// This is an abstract baseclass, derived for In-Memory- and Disk-based +// Environments. 
+class BlobManager +{ + protected: + // Flags for the PBlobHeader structure + enum { + // Blob is compressed + kIsCompressed = 1 + }; + + public: + // Flags for allocate(); make sure that they do not conflict with + // the flags for ham_db_insert() + enum { + // Do not compress the blob, even if compression is enabled + kDisableCompression = 0x10000000 + }; + + BlobManager(LocalEnvironment *env) + : m_env(env), m_metric_before_compression(0), + m_metric_after_compression(0), m_metric_total_allocated(0), + m_metric_total_read(0) { + } + + virtual ~BlobManager() { } + + // Allocates/create a new blob. + // This function returns the blob-id (the start address of the blob + // header) + // + // |flags| can be HAM_PARTIAL, kDisableCompression + uint64_t allocate(Context *context, ham_record_t *record, uint32_t flags); + + // Reads a blob and stores the data in @a record. + // @ref flags: either 0 or HAM_DIRECT_ACCESS + void read(Context *context, uint64_t blob_id, ham_record_t *record, + uint32_t flags, ByteArray *arena); + + // Retrieves the size of a blob + uint64_t get_blob_size(Context *context, uint64_t blob_id); + + // Overwrites an existing blob + // + // Will return an error if the blob does not exist. Returns the blob-id + // (the start address of the blob header) + uint64_t overwrite(Context *context, uint64_t old_blob_id, + ham_record_t *record, uint32_t flags); + + // Deletes an existing blob + void erase(Context *context, uint64_t blob_id, Page *page = 0, + uint32_t flags = 0); + + // Fills in the current metrics + void fill_metrics(ham_env_metrics_t *metrics) const { + metrics->blob_total_allocated = m_metric_total_allocated; + metrics->blob_total_read = m_metric_total_read; + metrics->record_bytes_before_compression = m_metric_before_compression; + metrics->record_bytes_after_compression = m_metric_after_compression; + } + + protected: + // Allocates/create a new blob. 
+ // This function returns the blob-id (the start address of the blob + // header) + virtual uint64_t do_allocate(Context *context, ham_record_t *record, + uint32_t flags) = 0; + + // Reads a blob and stores the data in @a record. + // @ref flags: either 0 or HAM_DIRECT_ACCESS + virtual void do_read(Context *context, uint64_t blob_id, + ham_record_t *record, uint32_t flags, + ByteArray *arena) = 0; + + // Retrieves the size of a blob + virtual uint64_t do_get_blob_size(Context *context, + uint64_t blob_id) = 0; + + // Overwrites an existing blob + // + // Will return an error if the blob does not exist. Returns the blob-id + // (the start address of the blob header) + virtual uint64_t do_overwrite(Context *context, uint64_t old_blob_id, + ham_record_t *record, uint32_t flags) = 0; + + // Deletes an existing blob + virtual void do_erase(Context *context, uint64_t blob_id, + Page *page = 0, uint32_t flags = 0) = 0; + + // The Environment which created this BlobManager + LocalEnvironment *m_env; + + // Usage tracking - number of bytes before compression + uint64_t m_metric_before_compression; + + // Usage tracking - number of bytes after compression + uint64_t m_metric_after_compression; + + private: + // Usage tracking - number of blobs allocated + uint64_t m_metric_total_allocated; + + // Usage tracking - number of blobs read + uint64_t m_metric_total_read; +}; + +} // namespace hamsterdb + +#endif /* HAM_BLOB_MANAGER_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager_disk.cc b/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager_disk.cc new file mode 100644 index 0000000000..231789774b --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager_disk.cc @@ -0,0 +1,637 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +#include <algorithm> +#include <vector> + +// Always verify that a file of level N does not include headers > N! +#include "1base/error.h" +#include "1base/dynamic_array.h" +#include "2device/device.h" +#include "3blob_manager/blob_manager_disk.h" +#include "3page_manager/page_manager.h" +#include "4db/db_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +using namespace hamsterdb; + +uint64_t +DiskBlobManager::do_allocate(Context *context, ham_record_t *record, + uint32_t flags) +{ + uint8_t *chunk_data[2]; + uint32_t chunk_size[2]; + uint32_t page_size = m_env->config().page_size_bytes; + + PBlobHeader blob_header; + uint32_t alloc_size = sizeof(PBlobHeader) + record->size; + + // first check if we can add another blob to the last used page + Page *page = m_env->page_manager()->get_last_blob_page(context); + + PBlobPageHeader *header = 0; + uint64_t address = 0; + if (page) { + header = PBlobPageHeader::from_page(page); + // allocate space for the blob + if (!alloc_from_freelist(header, alloc_size, &address)) + page = 0; + else + address += page->get_address(); + } + + if (!address) { + // Allocate a new page. If the blob exceeds a page then allocate multiple + // pages that are directly next to each other. 
+ uint32_t required_size = alloc_size + kPageOverhead; + uint32_t num_pages = required_size / page_size; + if (num_pages * page_size < required_size) + num_pages++; + + // |page| now points to the first page that was allocated, and + // the only one which has a header and a freelist + page = m_env->page_manager()->alloc_multiple_blob_pages(context, num_pages); + ham_assert(page->is_without_header() == false); + + // initialize the PBlobPageHeader + header = PBlobPageHeader::from_page(page); + header->initialize(); + header->set_num_pages(num_pages); + header->set_free_bytes((num_pages * page_size) - kPageOverhead); + + // and move the remaining space to the freelist, unless we span multiple + // pages (then the rest will be discarded) - TODO can we reuse it somehow? + if (num_pages == 1 + && kPageOverhead + alloc_size > 0 + && header->get_free_bytes() - alloc_size > 0) { + header->set_freelist_offset(0, kPageOverhead + alloc_size); + header->set_freelist_size(0, header->get_free_bytes() - alloc_size); + } + + address = page->get_address() + kPageOverhead; + ham_assert(check_integrity(header)); + } + + // addjust "free bytes" counter + ham_assert(header->get_free_bytes() >= alloc_size); + header->set_free_bytes(header->get_free_bytes() - alloc_size); + + // store the page id if it still has space left + if (header->get_free_bytes()) + m_env->page_manager()->set_last_blob_page(page); + else + m_env->page_manager()->set_last_blob_page(0); + + // initialize the blob header + blob_header.set_alloc_size(alloc_size); + blob_header.set_size(record->size); + blob_header.set_self(address); + + // PARTIAL WRITE + // + // Are there gaps at the beginning? 
If yes, then we'll fill with zeros + ByteArray zeroes; + if ((flags & HAM_PARTIAL) && (record->partial_offset > 0)) { + uint32_t gapsize = record->partial_offset; + + // first: write the header + chunk_data[0] = (uint8_t *)&blob_header; + chunk_size[0] = sizeof(blob_header); + write_chunks(context, page, address, chunk_data, chunk_size, 1); + + address += sizeof(blob_header); + + // now fill the gap; if the gap is bigger than a pagesize we'll + // split the gap into smaller chunks + while (gapsize) { + uint32_t size = gapsize >= page_size + ? page_size + : gapsize; + chunk_data[0] = (uint8_t *)zeroes.resize(size, 0); + chunk_size[0] = size; + write_chunks(context, page, address, chunk_data, chunk_size, 1); + gapsize -= size; + address += size; + } + + // now write the "real" data + chunk_data[0] = (uint8_t *)record->data; + chunk_size[0] = record->partial_size; + + write_chunks(context, page, address, chunk_data, chunk_size, 1); + address += record->partial_size; + } + else { + // not writing partially: write header and data, then we're done + chunk_data[0] = (uint8_t *)&blob_header; + chunk_size[0] = sizeof(blob_header); + chunk_data[1] = (uint8_t *)record->data; + chunk_size[1] = (flags & HAM_PARTIAL) + ? record->partial_size + : record->size; + + write_chunks(context, page, address, chunk_data, chunk_size, 2); + address += chunk_size[0] + chunk_size[1]; + } + + // store the blobid; it will be returned to the caller + uint64_t blobid = blob_header.get_self(); + + // PARTIAL WRITES: + // + // if we have gaps at the end of the blob: just append more chunks to + // fill these gaps. Since they can be pretty large we split them into + // smaller chunks if necessary. 
+ if (flags & HAM_PARTIAL) { + if (record->partial_offset + record->partial_size < record->size) { + uint32_t gapsize = record->size + - (record->partial_offset + record->partial_size); + + // now fill the gap; if the gap is bigger than a pagesize we'll + // split the gap into smaller chunks + // + // we split this loop in two - the outer loop will allocate the + // memory buffer, thus saving some allocations + while (gapsize) { + uint32_t size = gapsize > page_size + ? page_size + : gapsize; + chunk_data[0] = (uint8_t *)zeroes.resize(size, 0); + chunk_size[0] = size; + write_chunks(context, page, address, chunk_data, chunk_size, 1); + gapsize -= size; + address += size; + } + } + } + + ham_assert(check_integrity(header)); + + return (blobid); +} + +void +DiskBlobManager::do_read(Context *context, uint64_t blobid, + ham_record_t *record, uint32_t flags, ByteArray *arena) +{ + Page *page; + + // first step: read the blob header + PBlobHeader *blob_header = (PBlobHeader *)read_chunk(context, 0, &page, + blobid, true); + + // sanity check + if (blob_header->get_self() != blobid) { + ham_log(("blob %lld not found", blobid)); + throw Exception(HAM_BLOB_NOT_FOUND); + } + + uint32_t blobsize = (uint32_t)blob_header->get_size(); + record->size = blobsize; + + if (flags & HAM_PARTIAL) { + if (record->partial_offset > blobsize) { + ham_trace(("partial offset is greater than the total record size")); + throw Exception(HAM_INV_PARAMETER); + } + if (record->partial_offset + record->partial_size > blobsize) + record->partial_size = blobsize = blobsize - record->partial_offset; + else + blobsize = record->partial_size; + } + + // empty blob? 
+ if (!blobsize) { + record->data = 0; + record->size = 0; + return; + } + + // if the blob is in memory-mapped storage (and the user does not require + // a copy of the data): simply return a pointer + if ((flags & HAM_FORCE_DEEP_COPY) == 0 + && m_env->device()->is_mapped(blobid, blobsize) + && !(record->flags & HAM_RECORD_USER_ALLOC)) { + record->data = read_chunk(context, page, 0, + blobid + sizeof(PBlobHeader) + (flags & HAM_PARTIAL + ? record->partial_offset + : 0), true); + } + // otherwise resize the blob buffer and copy the blob data into the buffer + else { + if (!(record->flags & HAM_RECORD_USER_ALLOC)) { + arena->resize(blobsize); + record->data = arena->get_ptr(); + } + + copy_chunk(context, page, 0, + blobid + sizeof(PBlobHeader) + (flags & HAM_PARTIAL + ? record->partial_offset + : 0), + (uint8_t *)record->data, blobsize, true); + } +} + +uint64_t +DiskBlobManager::do_get_blob_size(Context *context, uint64_t blobid) +{ + // read the blob header + PBlobHeader *blob_header = (PBlobHeader *)read_chunk(context, 0, 0, blobid, + true); + + if (blob_header->get_self() != blobid) + throw Exception(HAM_BLOB_NOT_FOUND); + + return (blob_header->get_size()); +} + +uint64_t +DiskBlobManager::do_overwrite(Context *context, uint64_t old_blobid, + ham_record_t *record, uint32_t flags) +{ + PBlobHeader *old_blob_header, new_blob_header; + Page *page; + + uint32_t alloc_size = sizeof(PBlobHeader) + record->size; + + // first, read the blob header; if the new blob fits into the + // old blob, we overwrite the old blob (and add the remaining + // space to the freelist, if there is any) + old_blob_header = (PBlobHeader *)read_chunk(context, 0, &page, + old_blobid, false); + + // sanity check + ham_assert(old_blob_header->get_self() == old_blobid); + if (old_blob_header->get_self() != old_blobid) + throw Exception(HAM_BLOB_NOT_FOUND); + + // now compare the sizes; does the new data fit in the old allocated + // space? 
+ if (alloc_size <= old_blob_header->get_alloc_size()) { + uint8_t *chunk_data[2]; + uint32_t chunk_size[2]; + + // setup the new blob header + new_blob_header.set_self(old_blob_header->get_self()); + new_blob_header.set_size(record->size); + new_blob_header.set_alloc_size(alloc_size); + new_blob_header.set_flags(0); // disable compression, just in case... + + // PARTIAL WRITE + // + // if we have a gap at the beginning, then we have to write the + // blob header and the blob data in two steps; otherwise we can + // write both immediately + if ((flags & HAM_PARTIAL) && (record->partial_offset)) { + chunk_data[0] = (uint8_t *)&new_blob_header; + chunk_size[0] = sizeof(new_blob_header); + write_chunks(context, page, new_blob_header.get_self(), + chunk_data, chunk_size, 1); + + chunk_data[0] = (uint8_t *)record->data; + chunk_size[0] = record->partial_size; + write_chunks(context, page, new_blob_header.get_self() + + sizeof(new_blob_header) + record->partial_offset, + chunk_data, chunk_size, 1); + } + else { + chunk_data[0] = (uint8_t *)&new_blob_header; + chunk_size[0] = sizeof(new_blob_header); + chunk_data[1] = (uint8_t *)record->data; + chunk_size[1] = (flags & HAM_PARTIAL) + ? record->partial_size + : record->size; + + write_chunks(context, page, new_blob_header.get_self(), + chunk_data, chunk_size, 2); + } + + // move remaining data to the freelist + if (alloc_size < old_blob_header->get_alloc_size()) { + PBlobPageHeader *header = PBlobPageHeader::from_page(page); + header->set_free_bytes(header->get_free_bytes() + + (uint32_t)(old_blob_header->get_alloc_size() - alloc_size)); + add_to_freelist(header, + (uint32_t)(old_blobid + alloc_size) - page->get_address(), + (uint32_t)old_blob_header->get_alloc_size() - alloc_size); + } + + // the old rid is the new rid + return (new_blob_header.get_self()); + } + + // if the new data is larger: allocate a fresh space for it + // and discard the old; 'overwrite' has become (delete + insert) now. 
+ uint64_t new_blobid = allocate(context, record, flags); + erase(context, old_blobid, 0, 0); + + return (new_blobid); +} + +void +DiskBlobManager::do_erase(Context *context, uint64_t blobid, Page *page, + uint32_t flags) +{ + // fetch the blob header + PBlobHeader *blob_header = (PBlobHeader *)read_chunk(context, 0, &page, + blobid, false); + + // sanity check + ham_verify(blob_header->get_self() == blobid); + if (blob_header->get_self() != blobid) + throw Exception(HAM_BLOB_NOT_FOUND); + + // update the "free bytes" counter in the blob page header + PBlobPageHeader *header = PBlobPageHeader::from_page(page); + header->set_free_bytes(header->get_free_bytes() + + blob_header->get_alloc_size()); + + // if the page is now completely empty (all blobs were erased) then move + // it to the freelist + if (header->get_free_bytes() == (header->get_num_pages() + * m_env->config().page_size_bytes) - kPageOverhead) { + m_env->page_manager()->set_last_blob_page(0); + m_env->page_manager()->del(context, page, header->get_num_pages()); + header->initialize(); + return; + } + + // otherwise move the blob to the freelist + add_to_freelist(header, (uint32_t)(blobid - page->get_address()), + (uint32_t)blob_header->get_alloc_size()); +} + +bool +DiskBlobManager::alloc_from_freelist(PBlobPageHeader *header, uint32_t size, + uint64_t *poffset) +{ + ham_assert(check_integrity(header)); + + // freelist is not used if this is a multi-page blob + if (header->get_num_pages() > 1) + return (false); + + uint32_t count = header->get_freelist_entries(); + + for (uint32_t i = 0; i < count; i++) { + // exact match + if (header->get_freelist_size(i) == size) { + *poffset = header->get_freelist_offset(i); + header->set_freelist_offset(i, 0); + header->set_freelist_size(i, 0); + ham_assert(check_integrity(header)); + return (true); + } + // space in freelist is larger than what we need? 
return this space, + // make sure the remaining gap stays in the freelist + if (header->get_freelist_size(i) > size) { + *poffset = header->get_freelist_offset(i); + header->set_freelist_offset(i, (uint32_t)(*poffset + size)); + header->set_freelist_size(i, header->get_freelist_size(i) - size); + ham_assert(check_integrity(header)); + return (true); + } + } + + // there was no gap large enough for the blob + return (false); +} + +void +DiskBlobManager::add_to_freelist(PBlobPageHeader *header, + uint32_t offset, uint32_t size) +{ + ham_assert(check_integrity(header)); + + // freelist is not used if this is a multi-page blob + if (header->get_num_pages() > 1) + return; + + uint32_t count = header->get_freelist_entries(); + + // first try to collapse the blobs + for (uint32_t i = 0; i < count; i++) { + if (offset + size == header->get_freelist_offset(i)) { + header->set_freelist_offset(i, offset); + header->set_freelist_size(i, header->get_freelist_size(i) + size); + ham_assert(check_integrity(header)); + return; + } + if (header->get_freelist_offset(i) + header->get_freelist_size(i) + == offset) { + header->set_freelist_size(i, header->get_freelist_size(i) + size); + ham_assert(check_integrity(header)); + return; + } + } + + // otherwise store the blob in a new slot, if available + uint32_t smallest = 0; + for (uint32_t i = 0; i < count; i++) { + // slot is empty + if (header->get_freelist_size(i) == 0) { + header->set_freelist_offset(i, offset); + header->set_freelist_size(i, size); + ham_assert(check_integrity(header)); + return; + } + // otherwise look for the smallest entry + if (header->get_freelist_size(i) < header->get_freelist_size(smallest)) { + smallest = i; + continue; + } + } + + // overwrite the smallest entry? 
+ if (size > header->get_freelist_size(smallest)) { + header->set_freelist_offset(smallest, offset); + header->set_freelist_size(smallest, size); + } + + ham_assert(check_integrity(header)); +} + +bool +DiskBlobManager::check_integrity(PBlobPageHeader *header) const +{ + ham_assert(header->get_num_pages() > 0); + + if (header->get_free_bytes() + kPageOverhead + > (m_env->config().page_size_bytes * header->get_num_pages())) { + ham_trace(("integrity violated: free bytes exceeds page boundary")); + return (false); + } + + // freelist is not used if this is a multi-page blob + if (header->get_num_pages() > 1) + return (true); + + uint32_t count = header->get_freelist_entries(); + uint32_t total_sizes = 0; + typedef std::pair<uint32_t, uint32_t> Range; + typedef std::vector<Range> RangeVec; + RangeVec ranges; + + for (uint32_t i = 0; i < count - 1; i++) { + if (header->get_freelist_size(i) == 0) { + ham_assert(header->get_freelist_offset(i) == 0); + continue; + } + total_sizes += header->get_freelist_size(i); + ranges.push_back(std::make_pair(header->get_freelist_offset(i), + header->get_freelist_size(i))); + } + + // the sum of freelist chunks must not exceed total number of free bytes + if (total_sizes > header->get_free_bytes()) { + ham_trace(("integrity violated: total freelist slots exceed free bytes")); + return (false); + } + + std::sort(ranges.begin(), ranges.end()); + + if (!ranges.empty()) { + for (uint32_t i = 0; i < ranges.size() - 1; i++) { + if (ranges[i].first + ranges[i].second + > m_env->config().page_size_bytes * header->get_num_pages()) { + ham_trace(("integrity violated: freelist slot %u/%u exceeds page", + ranges[i].first, ranges[i].second)); + return (false); + } + if (ranges[i].first + ranges[i].second > ranges[i + 1].first) { + ham_trace(("integrity violated: freelist slot %u/%u overlaps with %lu", + ranges[i].first, ranges[i].second, + ranges[i + 1].first)); + throw Exception(HAM_INTEGRITY_VIOLATED); + } + } + } + + return (true); +} + +void 
+DiskBlobManager::write_chunks(Context *context, Page *page, + uint64_t address, uint8_t **chunk_data, uint32_t *chunk_size, + uint32_t chunks) +{ + uint32_t page_size = m_env->config().page_size_bytes; + + // for each chunk... + for (uint32_t i = 0; i < chunks; i++) { + uint32_t size = chunk_size[i]; + uint8_t *data = chunk_data[i]; + + while (size) { + // get the page-id from this chunk + uint64_t pageid = address - (address % page_size); + + // is this the current page? if yes then continue working with this page, + // otherwise fetch the page + if (page && page->get_address() != pageid) + page = 0; + if (!page) + page = m_env->page_manager()->fetch(context, pageid, + PageManager::kNoHeader); + + uint32_t write_start = (uint32_t)(address - page->get_address()); + uint32_t write_size = (uint32_t)(page_size - write_start); + + // now write the data + if (write_size > size) + write_size = size; + memcpy(&page->get_raw_payload()[write_start], data, write_size); + page->set_dirty(true); + address += write_size; + data += write_size; + size -= write_size; + } + } +} + +void +DiskBlobManager::copy_chunk(Context *context, Page *page, Page **ppage, + uint64_t address, uint8_t *data, uint32_t size, + bool fetch_read_only) +{ + uint32_t page_size = m_env->config().page_size_bytes; + bool first_page = true; + + while (size) { + // get the page-id from this chunk + uint64_t pageid = address - (address % page_size); + + // is this the current page? 
if yes then continue working with this page, + // otherwise fetch the page + if (page && page->get_address() != pageid) + page = 0; + + if (!page) { + uint32_t flags = 0; + if (fetch_read_only) + flags |= PageManager::kReadOnly; + if (!first_page) + flags |= PageManager::kNoHeader; + page = m_env->page_manager()->fetch(context, pageid, flags); + } + + // now read the data from the page + uint32_t read_start = (uint32_t)(address - page->get_address()); + uint32_t read_size = (uint32_t)(page_size - read_start); + if (read_size > size) + read_size = size; + memcpy(data, &page->get_raw_payload()[read_start], read_size); + address += read_size; + data += read_size; + size -= read_size; + + first_page = false; + } + + if (ppage) + *ppage = page; +} + +uint8_t * +DiskBlobManager::read_chunk(Context *context, Page *page, Page **ppage, + uint64_t address, bool fetch_read_only) +{ + // get the page-id from this chunk + uint32_t page_size = m_env->config().page_size_bytes; + uint64_t pageid = address - (address % page_size); + + // is this the current page? if yes then continue working with this page, + // otherwise fetch the page + if (page && page->get_address() != pageid) + page = 0; + + if (!page) { + uint32_t flags = 0; + if (fetch_read_only) + flags |= PageManager::kReadOnly; + page = m_env->page_manager()->fetch(context, pageid, flags); + if (ppage) + *ppage = page; + } + + uint32_t read_start = (uint32_t)(address - page->get_address()); + return (&page->get_raw_payload()[read_start]); +} diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager_disk.h b/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager_disk.h new file mode 100644 index 0000000000..7ec8b67d95 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager_disk.h @@ -0,0 +1,196 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef HAM_BLOB_MANAGER_DISK_H +#define HAM_BLOB_MANAGER_DISK_H + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "3blob_manager/blob_manager.h" +#include "4env/env_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +#include "1base/packstart.h" + +/* + * The header of a blob page + * + * Contains a fixed length freelist and a couter for the number of free + * bytes + */ +HAM_PACK_0 class HAM_PACK_1 PBlobPageHeader +{ + public: + void initialize() { + memset(this, 0, sizeof(PBlobPageHeader)); + } + + // Returns a PBlobPageHeader from a page + static PBlobPageHeader *from_page(Page *page) { + return (PBlobPageHeader *)&page->get_payload()[0]; + } + + // Returns the number of pages which are all managed by this header + uint32_t get_num_pages() const { + return (m_num_pages); + } + + // Sets the number of pages which are all managed by this header + void set_num_pages(uint32_t num_pages) { + m_num_pages = num_pages; + } + + // Returns the "free bytes" counter + uint32_t get_free_bytes() const { + return (m_free_bytes); + } + + // Sets the "free bytes" counter + void set_free_bytes(uint32_t free_bytes) { + m_free_bytes = free_bytes; + } + + // Returns the total number of freelist entries + uint8_t get_freelist_entries() const { + return (32); + } + + // Returns the offset of freelist entry |i| + 
uint32_t get_freelist_offset(uint32_t i) const { + return (m_freelist[i].offset); + } + + // Sets the offset of freelist entry |i| + void set_freelist_offset(uint32_t i, uint32_t offset) { + m_freelist[i].offset = offset; + } + + // Returns the size of freelist entry |i| + uint32_t get_freelist_size(uint32_t i) const { + return (m_freelist[i].size); + } + + // Sets the size of freelist entry |i| + void set_freelist_size(uint32_t i, uint32_t size) { + m_freelist[i].size = size; + } + + private: + // Number of "regular" pages for this blob; used for blobs exceeding + // a page size + uint32_t m_num_pages; + + // Number of free bytes in this page + uint32_t m_free_bytes; + + struct FreelistEntry { + uint32_t offset; + uint32_t size; + }; + + // The freelist - offset/size pairs in this page + FreelistEntry m_freelist[32]; +} HAM_PACK_2; + +#include "1base/packstop.h" + + +/* + * A BlobManager for disk-based databases + */ +class DiskBlobManager : public BlobManager +{ + enum { + // Overhead per page + kPageOverhead = Page::kSizeofPersistentHeader + sizeof(PBlobPageHeader) + }; + + public: + DiskBlobManager(LocalEnvironment *env) + : BlobManager(env) { + } + + protected: + // allocate/create a blob + // returns the blob-id (the start address of the blob header) + virtual uint64_t do_allocate(Context *context, ham_record_t *record, + uint32_t flags); + + // reads a blob and stores the data in |record|. The pointer |record.data| + // is backed by the |arena|, unless |HAM_RECORD_USER_ALLOC| is set. 
+ // flags: either 0 or HAM_DIRECT_ACCESS + virtual void do_read(Context *context, uint64_t blobid, + ham_record_t *record, uint32_t flags, + ByteArray *arena); + + // retrieves the size of a blob + virtual uint64_t do_get_blob_size(Context *context, uint64_t blobid); + + // overwrite an existing blob + // + // will return an error if the blob does not exist + // returns the blob-id (the start address of the blob header) in |blobid| + virtual uint64_t do_overwrite(Context *context, uint64_t old_blobid, + ham_record_t *record, uint32_t flags); + + // delete an existing blob + virtual void do_erase(Context *context, uint64_t blobid, + Page *page = 0, uint32_t flags = 0); + + private: + friend class DuplicateManager; + friend struct BlobManagerFixture; + + // write a series of data chunks to storage at file offset 'addr'. + // + // The chunks are assumed to be stored in sequential order, adjacent + // to each other, i.e. as one long data strip. + void write_chunks(Context *context, Page *page, uint64_t addr, + uint8_t **chunk_data, uint32_t *chunk_size, + uint32_t chunks); + + // Same as above, but for reading chunks from the file. The data + // is copied to |data|. + void copy_chunk(Context *context, Page *page, Page **fpage, + uint64_t addr, uint8_t *data, uint32_t size, + bool fetch_read_only); + + // Same as |copy_chunk|, but does not copy the data + uint8_t *read_chunk(Context *context, Page *page, Page **fpage, + uint64_t addr, bool fetch_read_only); + + // adds a free chunk to the freelist + void add_to_freelist(PBlobPageHeader *header, uint32_t offset, + uint32_t size); + + // searches the freelist for a free chunk; if available, returns |true| + // and stores the offset in |poffset|. 
+ bool alloc_from_freelist(PBlobPageHeader *header, uint32_t size, + uint64_t *poffset); + + // verifies the integrity of the freelist + bool check_integrity(PBlobPageHeader *header) const; +}; + +} // namespace hamsterdb + +#endif /* HAM_BLOB_MANAGER_DISK_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager_factory.h b/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager_factory.h new file mode 100644 index 0000000000..129849c7ad --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager_factory.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef HAM_BLOB_MANAGER_FACTORY_H +#define HAM_BLOB_MANAGER_FACTORY_H + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! 
+#include "3blob_manager/blob_manager_disk.h" +#include "3blob_manager/blob_manager_inmem.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +struct BlobManagerFactory { + // creates a new BlobManager instance depending on the flags + static BlobManager *create(LocalEnvironment *env, uint32_t flags) { + if (flags & HAM_IN_MEMORY) + return (new InMemoryBlobManager(env)); + else + return (new DiskBlobManager(env)); + } +}; + +} // namespace hamsterdb + +#endif /* HAM_BLOB_MANAGER_FACTORY_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager_inmem.cc b/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager_inmem.cc new file mode 100644 index 0000000000..1044d815c5 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager_inmem.cc @@ -0,0 +1,148 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! 
+#include "1base/dynamic_array.h" +#include "2device/device_inmem.h" +#include "3blob_manager/blob_manager_inmem.h" +#include "4db/db_local.h" +#include "4env/env_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +using namespace hamsterdb; + +uint64_t +InMemoryBlobManager::do_allocate(Context *context, ham_record_t *record, + uint32_t flags) +{ + // in-memory-database: the blobid is actually a pointer to the memory + // buffer, in which the blob (with the blob-header) is stored + uint8_t *p = (uint8_t *)m_env->device()->alloc(record->size + + sizeof(PBlobHeader)); + + // initialize the header + PBlobHeader *blob_header = (PBlobHeader *)p; + memset(blob_header, 0, sizeof(*blob_header)); + blob_header->set_self((uint64_t)PTR_TO_U64(p)); + blob_header->set_alloc_size(record->size + sizeof(PBlobHeader)); + blob_header->set_size(record->size); + + // do we have gaps? if yes, fill them with zeroes + if (flags & HAM_PARTIAL) { + uint8_t *s = p + sizeof(PBlobHeader); + if (record->partial_offset) + memset(s, 0, record->partial_offset); + memcpy(s + record->partial_offset, record->data, record->partial_size); + if (record->partial_offset + record->partial_size < record->size) + memset(s + record->partial_offset + record->partial_size, 0, + record->size - (record->partial_offset + record->partial_size)); + } + else { + memcpy(p + sizeof(PBlobHeader), record->data, record->size); + } + + return ((uint64_t)PTR_TO_U64(p)); +} + +void +InMemoryBlobManager::do_read(Context *context, uint64_t blobid, + ham_record_t *record, uint32_t flags, + ByteArray *arena) +{ + // in-memory-database: the blobid is actually a pointer to the memory + // buffer, in which the blob is stored + PBlobHeader *blob_header = (PBlobHeader *)U64_TO_PTR(blobid); + uint8_t *data = (uint8_t *)(U64_TO_PTR(blobid)) + sizeof(PBlobHeader); + + // when the database is closing, the header is already deleted + if (!blob_header) { + record->size = 0; + return; + } + + uint32_t blobsize = 
(uint32_t)blob_header->get_size(); + record->size = blobsize; + + if (flags & HAM_PARTIAL) { + if (record->partial_offset > blobsize) { + ham_trace(("partial offset is greater than the total record size")); + throw Exception(HAM_INV_PARAMETER); + } + if (record->partial_offset + record->partial_size > blobsize) + record->partial_size = blobsize = blobsize - record->partial_offset; + else + blobsize = record->partial_size; + } + + // empty blob? + if (!blobsize) { + record->data = 0; + record->size = 0; + } + else { + uint8_t *d = data; + if (flags & HAM_PARTIAL) + d += record->partial_offset; + + if ((flags & HAM_DIRECT_ACCESS) + && !(record->flags & HAM_RECORD_USER_ALLOC)) { + record->data = d; + } + else { + // resize buffer if necessary + if (!(record->flags & HAM_RECORD_USER_ALLOC)) { + arena->resize(blobsize); + record->data = arena->get_ptr(); + } + // and copy the data + memcpy(record->data, d, blobsize); + } + } +} + +uint64_t +InMemoryBlobManager::do_overwrite(Context *context, uint64_t old_blobid, + ham_record_t *record, uint32_t flags) +{ + // free the old blob, allocate a new blob (but if both sizes are equal, + // just overwrite the data) + PBlobHeader *phdr = (PBlobHeader *)U64_TO_PTR(old_blobid); + + if (phdr->get_size() == record->size) { + uint8_t *p = (uint8_t *)phdr; + if (flags & HAM_PARTIAL) { + memmove(p + sizeof(PBlobHeader) + record->partial_offset, + record->data, record->partial_size); + } + else { + memmove(p + sizeof(PBlobHeader), record->data, record->size); + } + return ((uint64_t)PTR_TO_U64(phdr)); + } + else { + uint64_t new_blobid = allocate(context, record, flags); + + InMemoryDevice *dev = (InMemoryDevice *)m_env->device(); + dev->release(phdr, (size_t)phdr->get_alloc_size()); + return (new_blobid); + } +} + diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager_inmem.h b/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager_inmem.h new file mode 100644 index 0000000000..3c5b19a9fa --- /dev/null +++ 
b/plugins/Dbx_kv/src/hamsterdb/src/3blob_manager/blob_manager_inmem.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef HAM_BLOB_MANAGER_INMEM_H +#define HAM_BLOB_MANAGER_INMEM_H + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "3blob_manager/blob_manager.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +/** + * A BlobManager for in-memory blobs + */ +class InMemoryBlobManager : public BlobManager { + public: + InMemoryBlobManager(LocalEnvironment *env) + : BlobManager(env) { + } + + protected: + // Allocates/create a new blob + // This function returns the blob-id (the start address of the blob + // header) + virtual uint64_t do_allocate(Context *context, ham_record_t *record, + uint32_t flags); + + // Reads a blob and stores the data in |record| + // |flags|: either 0 or HAM_DIRECT_ACCESS + virtual void do_read(Context *context, uint64_t blobid, + ham_record_t *record, uint32_t flags, + ByteArray *arena); + + // Retrieves the size of a blob + virtual uint64_t do_get_blob_size(Context *context, uint64_t blobid) { + PBlobHeader *blob_header = (PBlobHeader *)U64_TO_PTR(blobid); + return ((uint32_t)blob_header->get_size()); + } + + // Overwrites an existing blob + // + // Will return an error if the blob does not exist. 
Returns the blob-id + // (the start address of the blob header) + virtual uint64_t do_overwrite(Context *context, uint64_t old_blobid, + ham_record_t *record, uint32_t flags); + + // Deletes an existing blob + virtual void do_erase(Context *context, uint64_t blobid, + Page *page = 0, uint32_t flags = 0) { + Memory::release((void *)U64_TO_PTR(blobid)); + } +}; + +} // namespace hamsterdb + +#endif /* HAM_BLOB_MANAGER_INMEM_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_check.cc b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_check.cc new file mode 100644 index 0000000000..73098ce3e1 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_check.cc @@ -0,0 +1,325 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * btree verification + */ + +#include "0root/root.h" + +#include <set> +#include <string.h> +#include <stdio.h> +#if HAM_DEBUG +# include <sstream> +# include <fstream> +#endif + +// Always verify that a file of level N does not include headers > N! 
+#include "1base/error.h" +#include "2page/page.h" +#include "3page_manager/page_manager.h" +#include "3page_manager/page_manager_test.h" +#include "3btree/btree_index.h" +#include "3btree/btree_node_proxy.h" +#include "4db/db.h" +#include "4env/env.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class BtreeCheckAction +{ + public: + // Constructor + BtreeCheckAction(BtreeIndex *btree, Context *context, uint32_t flags) + : m_btree(btree), m_context(context), m_flags(flags) { + } + + // This is the main method; it starts the verification. + void run() { + Page *page, *parent = 0; + uint32_t level = 0; + LocalDatabase *db = m_btree->get_db(); + LocalEnvironment *env = db->lenv(); + + ham_assert(m_btree->get_root_address() != 0); + + // get the root page of the tree + page = env->page_manager()->fetch(m_context, m_btree->get_root_address(), + PageManager::kReadOnly); + +#if HAM_DEBUG + if (m_flags & HAM_PRINT_GRAPH) { + m_graph << "digraph g {" << std::endl + << " graph [" << std::endl + << " rankdir = \"TD\"" << std::endl + << " ];" << std::endl + << " node [" << std::endl + << " fontsize = \"8\"" << std::endl + << " shape = \"ellipse\"" << std::endl + << " ];" << std::endl + << " edge [" << std::endl + << " ];" << std::endl; + } +#endif + + // for each level... 
+ while (page) { + BtreeNodeProxy *node = m_btree->get_node_from_page(page); + uint64_t ptr_down = node->get_ptr_down(); + + // verify the page and all its siblings + verify_level(parent, page, level); + parent = page; + + // follow the pointer to the smallest child + if (ptr_down) + page = env->page_manager()->fetch(m_context, ptr_down, + PageManager::kReadOnly); + else + page = 0; + + ++level; + } + +#if HAM_DEBUG + if (m_flags & HAM_PRINT_GRAPH) { + m_graph << "}" << std::endl; + + std::ofstream file; + file.open("graph.dot"); + file << m_graph.str(); + } +#endif + } + + private: + // Verifies a whole level in the tree - start with "page" and traverse + // the linked list of all the siblings + void verify_level(Page *parent, Page *page, uint32_t level) { + LocalDatabase *db = m_btree->get_db(); + LocalEnvironment *env = db->lenv(); + Page *child, *leftsib = 0; + BtreeNodeProxy *node = m_btree->get_node_from_page(page); + + // assert that the parent page's smallest item (item 0) is bigger + // than the largest item in this page + if (parent && node->get_left()) { + int cmp = compare_keys(db, page, 0, node->get_count() - 1); + if (cmp <= 0) { + ham_log(("integrity check failed in page 0x%llx: parent item " + "#0 <= item #%d\n", page->get_address(), + node->get_count() - 1)); + throw Exception(HAM_INTEGRITY_VIOLATED); + } + } + + m_children.clear(); + + while (page) { + // verify the page + verify_page(parent, leftsib, page, level); + + // follow the right sibling + BtreeNodeProxy *node = m_btree->get_node_from_page(page); + if (node->get_right()) + child = env->page_manager()->fetch(m_context, + node->get_right(), PageManager::kReadOnly); + else + child = 0; + + if (leftsib) { + BtreeNodeProxy *leftnode = m_btree->get_node_from_page(leftsib); + ham_assert(leftnode->is_leaf() == node->is_leaf()); + } + + leftsib = page; + page = child; + } + } + + // Verifies a single page + void verify_page(Page *parent, Page *leftsib, Page *page, uint32_t level) { + LocalDatabase 
*db = m_btree->get_db(); + LocalEnvironment *env = db->lenv(); + BtreeNodeProxy *node = m_btree->get_node_from_page(page); + +#if HAM_DEBUG + if (m_flags & HAM_PRINT_GRAPH) { + std::stringstream ss; + ss << "node" << page->get_address(); + m_graph << " \"" << ss.str() << "\" [" << std::endl + << " label = \""; + m_graph << "<fl>L|<fd>D|"; + for (uint32_t i = 0; i < node->get_count(); i++) { + m_graph << "<f" << i << ">" << i << "|"; + } + m_graph << "<fr>R\"" << std::endl + << " shape = \"record\"" << std::endl + << " ];" << std::endl; +#if 0 + // edge to the left sibling + if (node->get_left()) + m_graph << "\"" << ss.str() << "\":fl -> \"node" + << node->get_left() << "\":fr [" << std::endl + << " ];" << std::endl; + // to the right sibling + if (node->get_right()) + m_graph << " \"" << ss.str() << "\":fr -> \"node" + << node->get_right() << "\":fl [" << std::endl + << " ];" << std::endl; +#endif + // to ptr_down + if (node->get_ptr_down()) + m_graph << " \"" << ss.str() << "\":fd -> \"node" + << node->get_ptr_down() << "\":fd [" << std::endl + << " ];" << std::endl; + // to all children + if (!node->is_leaf()) { + for (uint32_t i = 0; i < node->get_count(); i++) { + m_graph << " \"" << ss.str() << "\":f" << i << " -> \"node" + << node->get_record_id(m_context, i) << "\":fd [" + << std::endl << " ];" << std::endl; + } + } + } +#endif + + if (node->get_count() == 0) { + // a rootpage can be empty! check if this page is the rootpage + if (page->get_address() == m_btree->get_root_address()) + return; + + // for internal nodes: ptr_down HAS to be set! 
+ if (!node->is_leaf() && node->get_ptr_down() == 0) { + ham_log(("integrity check failed in page 0x%llx: empty page!\n", + page->get_address())); + throw Exception(HAM_INTEGRITY_VIOLATED); + } + } + + // check if the largest item of the left sibling is smaller than + // the smallest item of this page + if (leftsib) { + BtreeNodeProxy *sibnode = m_btree->get_node_from_page(leftsib); + ham_key_t key1 = {0}; + ham_key_t key2 = {0}; + + node->check_integrity(m_context); + + if (node->get_count() > 0 && sibnode->get_count() > 0) { + sibnode->get_key(m_context, sibnode->get_count() - 1, + &m_barray1, &key1); + node->get_key(m_context, 0, &m_barray2, &key2); + + int cmp = node->compare(&key1, &key2); + if (cmp >= 0) { + ham_log(("integrity check failed in page 0x%llx: item #0 " + "< left sibling item #%d\n", page->get_address(), + sibnode->get_count() - 1)); + throw Exception(HAM_INTEGRITY_VIOLATED); + } + } + } + + if (node->get_count() == 1) + return; + + node->check_integrity(m_context); + + if (node->get_count() > 0) { + for (uint32_t i = 0; i < node->get_count() - 1; i++) { + int cmp = compare_keys(db, page, (uint32_t)i, (uint32_t)(i + 1)); + if (cmp >= 0) { + ham_log(("integrity check failed in page 0x%llx: item #%d " + "< item #%d", page->get_address(), i, i + 1)); + throw Exception(HAM_INTEGRITY_VIOLATED); + } + } + } + + // internal nodes: make sure that all record IDs are unique + if (!node->is_leaf()) { + if (m_children.find(node->get_ptr_down()) != m_children.end()) { + ham_log(("integrity check failed in page 0x%llx: record of item " + "-1 is not unique", page->get_address())); + throw Exception(HAM_INTEGRITY_VIOLATED); + } + m_children.insert(node->get_ptr_down()); + + for (uint32_t i = 0; i < node->get_count(); i++) { + uint64_t child_id = node->get_record_id(m_context, i); + if (m_children.find(child_id) != m_children.end()) { + ham_log(("integrity check failed in page 0x%llx: record of item " + "#%d is not unique", page->get_address(), i)); + throw 
Exception(HAM_INTEGRITY_VIOLATED); + } + PageManagerTest test = env->page_manager()->test(); + if (test.is_page_free(child_id)) { + ham_log(("integrity check failed in page 0x%llx: record of item " + "#%d is in freelist", page->get_address(), i)); + throw Exception(HAM_INTEGRITY_VIOLATED); + } + m_children.insert(child_id); + } + } + } + + int compare_keys(LocalDatabase *db, Page *page, int lhs, int rhs) { + BtreeNodeProxy *node = m_btree->get_node_from_page(page); + ham_key_t key1 = {0}; + ham_key_t key2 = {0}; + + node->get_key(m_context, lhs, &m_barray1, &key1); + node->get_key(m_context, rhs, &m_barray2, &key2); + + return (node->compare(&key1, &key2)); + } + + // The BtreeIndex on which we operate + BtreeIndex *m_btree; + + // The current Context + Context *m_context; + + // The flags as specified when calling ham_db_check_integrity + uint32_t m_flags; + + // ByteArrays to avoid frequent memory allocations + ByteArray m_barray1; + ByteArray m_barray2; + + // For checking uniqueness of record IDs on an internal level + std::set<uint64_t> m_children; + +#if HAM_DEBUG + // For printing the graph + std::ostringstream m_graph; +#endif +}; + +void +BtreeIndex::check_integrity(Context *context, uint32_t flags) +{ + BtreeCheckAction bta(this, context, flags); + bta.run(); +} + +} // namespace hamsterdb diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_cursor.cc b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_cursor.cc new file mode 100644 index 0000000000..b66b58c645 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_cursor.cc @@ -0,0 +1,561 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +#include <string.h> + +// Always verify that a file of level N does not include headers > N! +#include "1base/error.h" +#include "2page/page.h" +#include "3page_manager/page_manager.h" +#include "3btree/btree_index.h" +#include "3btree/btree_cursor.h" +#include "3btree/btree_node_proxy.h" +#include "4cursor/cursor.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +BtreeCursor::BtreeCursor(Cursor *parent) + : m_parent(parent), m_state(0), m_duplicate_index(0), + m_coupled_page(0), m_coupled_index(0), m_next_in_page(0), + m_previous_in_page(0) +{ + memset(&m_uncoupled_key, 0, sizeof(m_uncoupled_key)); + m_btree = parent->get_db()->btree_index(); +} + +void +BtreeCursor::set_to_nil() +{ + // uncoupled cursor: free the cached pointer + if (m_state == kStateUncoupled) + memset(&m_uncoupled_key, 0, sizeof(m_uncoupled_key)); + // coupled cursor: remove from page + else if (m_state == kStateCoupled) + remove_cursor_from_page(m_coupled_page); + + m_state = BtreeCursor::kStateNil; + m_duplicate_index = 0; +} + +void +BtreeCursor::uncouple_from_page(Context *context) +{ + if (m_state == kStateUncoupled || m_state == kStateNil) + return; + + ham_assert(m_coupled_page != 0); + + // get the btree-entry of this key + BtreeNodeProxy *node = m_btree->get_node_from_page(m_coupled_page); + ham_assert(node->is_leaf()); + node->get_key(context, m_coupled_index, &m_uncoupled_arena, &m_uncoupled_key); + + // uncouple the page + remove_cursor_from_page(m_coupled_page); + + // set the state and 
the uncoupled key + m_state = BtreeCursor::kStateUncoupled; +} + +void +BtreeCursor::clone(BtreeCursor *other) +{ + m_duplicate_index = other->m_duplicate_index; + + // if the old cursor is coupled: couple the new cursor, too + if (other->m_state == kStateCoupled) { + couple_to_page(other->m_coupled_page, other->m_coupled_index); + } + // otherwise, if the src cursor is uncoupled: copy the key + else if (other->m_state == kStateUncoupled) { + memset(&m_uncoupled_key, 0, sizeof(m_uncoupled_key)); + + m_uncoupled_arena.copy(other->m_uncoupled_arena.get_ptr(), + other->m_uncoupled_arena.get_size()); + m_uncoupled_key.data = m_uncoupled_arena.get_ptr(); + m_uncoupled_key.size = m_uncoupled_arena.get_size(); + m_state = kStateUncoupled; + } + else { + set_to_nil(); + } +} + +void +BtreeCursor::overwrite(Context *context, ham_record_t *record, uint32_t flags) +{ + // uncoupled cursor: couple it + if (m_state == kStateUncoupled) + couple(context); + else if (m_state != kStateCoupled) + throw Exception(HAM_CURSOR_IS_NIL); + + // copy the key flags, and remove all flags concerning the key size + BtreeNodeProxy *node = m_btree->get_node_from_page(m_coupled_page); + node->set_record(context, m_coupled_index, record, m_duplicate_index, + flags | HAM_OVERWRITE, 0); + + m_coupled_page->set_dirty(true); +} + +ham_status_t +BtreeCursor::move(Context *context, ham_key_t *key, ByteArray *key_arena, + ham_record_t *record, ByteArray *record_arena, uint32_t flags) +{ + ham_status_t st = 0; + + if (flags & HAM_CURSOR_FIRST) + st = move_first(context, flags); + else if (flags & HAM_CURSOR_LAST) + st = move_last(context, flags); + else if (flags & HAM_CURSOR_NEXT) + st = move_next(context, flags); + else if (flags & HAM_CURSOR_PREVIOUS) + st = move_previous(context, flags); + // no move, but cursor is nil? return error + else if (m_state == kStateNil) { + if (key || record) + return (HAM_CURSOR_IS_NIL); + else + return (0); + } + // no move, but cursor is not coupled? 
couple it + else if (m_state == kStateUncoupled) + couple(context); + + if (st) + return (st); + + ham_assert(m_state == kStateCoupled); + + BtreeNodeProxy *node = m_btree->get_node_from_page(m_coupled_page); + ham_assert(node->is_leaf()); + + if (key) + node->get_key(context, m_coupled_index, key_arena, key); + + if (record) + node->get_record(context, m_coupled_index, record_arena, record, + flags, m_duplicate_index); + + return (0); +} + +ham_status_t +BtreeCursor::find(Context *context, ham_key_t *key, ByteArray *key_arena, + ham_record_t *record, ByteArray *record_arena, uint32_t flags) +{ + set_to_nil(); + + return (m_btree->find(context, m_parent, key, key_arena, record, + record_arena, flags)); +} + +bool +BtreeCursor::points_to(Context *context, Page *page, int slot) +{ + if (m_state == kStateUncoupled) + couple(context); + + if (m_state == kStateCoupled) + return (m_coupled_page == page && m_coupled_index == slot); + + return (false); +} + +bool +BtreeCursor::points_to(Context *context, ham_key_t *key) +{ + if (m_state == kStateUncoupled) { + if (m_uncoupled_key.size != key->size) + return (false); + return (0 == m_btree->compare_keys(key, &m_uncoupled_key)); + } + + if (m_state == kStateCoupled) { + BtreeNodeProxy *node = m_btree->get_node_from_page(m_coupled_page); + return (node->equals(context, key, m_coupled_index)); + } + + ham_assert(!"shouldn't be here"); + return (false); +} + +ham_status_t +BtreeCursor::move_to_next_page(Context *context) +{ + LocalEnvironment *env = m_parent->get_db()->lenv(); + + // uncoupled cursor: couple it + if (m_state == kStateUncoupled) + couple(context); + else if (m_state != kStateCoupled) + return (HAM_CURSOR_IS_NIL); + + BtreeNodeProxy *node = m_btree->get_node_from_page(m_coupled_page); + // if there is no right sibling then couple the cursor to the right-most + // key in the last page and return KEY_NOT_FOUND + if (!node->get_right()) { + couple_to_page(m_coupled_page, node->get_count() - 1, 0); + return 
(HAM_KEY_NOT_FOUND); + } + + Page *page = env->page_manager()->fetch(context, node->get_right(), + PageManager::kReadOnly); + couple_to_page(page, 0, 0); + return (0); +} + +int +BtreeCursor::get_record_count(Context *context, uint32_t flags) +{ + // uncoupled cursor: couple it + if (m_state == kStateUncoupled) + couple(context); + else if (m_state != kStateCoupled) + throw Exception(HAM_CURSOR_IS_NIL); + + BtreeNodeProxy *node = m_btree->get_node_from_page(m_coupled_page); + return (node->get_record_count(context, m_coupled_index)); +} + +uint64_t +BtreeCursor::get_record_size(Context *context) +{ + // uncoupled cursor: couple it + if (m_state == kStateUncoupled) + couple(context); + else if (m_state != kStateCoupled) + throw Exception(HAM_CURSOR_IS_NIL); + + BtreeNodeProxy *node = m_btree->get_node_from_page(m_coupled_page); + return (node->get_record_size(context, m_coupled_index, m_duplicate_index)); +} + +void +BtreeCursor::couple(Context *context) +{ + ham_assert(m_state == kStateUncoupled); + + /* + * Make a 'find' on the cached key; if we succeed, the cursor + * is automatically coupled. Since |find()| overwrites and modifies + * the cursor's state, keep a backup and restore it afterwards. 
+ */ + int duplicate_index = m_duplicate_index; + ByteArray uncoupled_arena = m_uncoupled_arena; + ham_key_t uncoupled_key = m_uncoupled_key; + m_uncoupled_arena = ByteArray(); + + find(context, &uncoupled_key, 0, 0, 0, 0); + + m_duplicate_index = duplicate_index; + m_uncoupled_key = uncoupled_key; + m_uncoupled_arena = uncoupled_arena; + uncoupled_arena.disown(); // do not free when going out of scope +} + +ham_status_t +BtreeCursor::move_first(Context *context, uint32_t flags) +{ + LocalDatabase *db = m_parent->get_db(); + LocalEnvironment *env = db->lenv(); + + // get a NIL cursor + set_to_nil(); + + // get the root page + Page *page = env->page_manager()->fetch(context, + m_btree->get_root_address(), PageManager::kReadOnly); + BtreeNodeProxy *node = m_btree->get_node_from_page(page); + + // traverse down to the leafs + while (!node->is_leaf()) { + page = env->page_manager()->fetch(context, node->get_ptr_down(), + PageManager::kReadOnly); + node = m_btree->get_node_from_page(page); + } + + // and to the next page that is NOT empty + while (node->get_count() == 0) { + if (node->get_right() == 0) + return (HAM_KEY_NOT_FOUND); + page = env->page_manager()->fetch(context, node->get_right(), + PageManager::kReadOnly); + node = m_btree->get_node_from_page(page); + } + + // couple this cursor to the smallest key in this page + couple_to_page(page, 0, 0); + + return (0); +} + +ham_status_t +BtreeCursor::move_next(Context *context, uint32_t flags) +{ + LocalDatabase *db = m_parent->get_db(); + LocalEnvironment *env = db->lenv(); + + // uncoupled cursor: couple it + if (m_state == kStateUncoupled) + couple(context); + else if (m_state != kStateCoupled) + return (HAM_CURSOR_IS_NIL); + + BtreeNodeProxy *node = m_btree->get_node_from_page(m_coupled_page); + + // if this key has duplicates: get the next duplicate; otherwise + // (and if there's no duplicate): fall through + if (!(flags & HAM_SKIP_DUPLICATES)) { + if (m_duplicate_index + < node->get_record_count(context, 
m_coupled_index) - 1) { + m_duplicate_index++; + return (0); + } + } + + // don't continue if ONLY_DUPLICATES is set + if (flags & HAM_ONLY_DUPLICATES) + return (HAM_KEY_NOT_FOUND); + + // if the index+1 is still in the coupled page, just increment the index + if (m_coupled_index + 1 < (int)node->get_count()) { + couple_to_page(m_coupled_page, m_coupled_index + 1, 0); + return (0); + } + + // otherwise uncouple the cursor and load the right sibling page + if (!node->get_right()) + return (HAM_KEY_NOT_FOUND); + + Page *page = env->page_manager()->fetch(context, node->get_right(), + PageManager::kReadOnly); + node = m_btree->get_node_from_page(page); + + // if the right node is empty then continue searching for the next + // non-empty page + while (node->get_count() == 0) { + if (!node->get_right()) + return (HAM_KEY_NOT_FOUND); + page = env->page_manager()->fetch(context, node->get_right(), + PageManager::kReadOnly); + node = m_btree->get_node_from_page(page); + } + + // couple this cursor to the smallest key in this page + couple_to_page(page, 0, 0); + + return (0); +} + +ham_status_t +BtreeCursor::move_previous(Context *context, uint32_t flags) +{ + LocalDatabase *db = m_parent->get_db(); + LocalEnvironment *env = db->lenv(); + + // uncoupled cursor: couple it + if (m_state == kStateUncoupled) + couple(context); + else if (m_state != kStateCoupled) + return (HAM_CURSOR_IS_NIL); + + BtreeNodeProxy *node = m_btree->get_node_from_page(m_coupled_page); + + // if this key has duplicates: get the previous duplicate; otherwise + // (and if there's no duplicate): fall through + if (!(flags & HAM_SKIP_DUPLICATES) && m_duplicate_index > 0) { + m_duplicate_index--; + return (0); + } + + // don't continue if ONLY_DUPLICATES is set + if (flags & HAM_ONLY_DUPLICATES) + return (HAM_KEY_NOT_FOUND); + + // if the index-1 is till in the coupled page, just decrement the index + if (m_coupled_index != 0) { + couple_to_page(m_coupled_page, m_coupled_index - 1); + } + // otherwise load 
the left sibling page + else { + if (!node->get_left()) + return (HAM_KEY_NOT_FOUND); + + Page *page = env->page_manager()->fetch(context, node->get_left(), + PageManager::kReadOnly); + node = m_btree->get_node_from_page(page); + + // if the left node is empty then continue searching for the next + // non-empty page + while (node->get_count() == 0) { + if (!node->get_left()) + return (HAM_KEY_NOT_FOUND); + page = env->page_manager()->fetch(context, node->get_left(), + PageManager::kReadOnly); + node = m_btree->get_node_from_page(page); + } + + // couple this cursor to the highest key in this page + couple_to_page(page, node->get_count() - 1); + } + m_duplicate_index = 0; + + // if duplicates are enabled: move to the end of the duplicate-list + if (!(flags & HAM_SKIP_DUPLICATES)) + m_duplicate_index = node->get_record_count(context, m_coupled_index) - 1; + + return (0); +} + +ham_status_t +BtreeCursor::move_last(Context *context, uint32_t flags) +{ + LocalDatabase *db = m_parent->get_db(); + LocalEnvironment *env = db->lenv(); + + // get a NIL cursor + set_to_nil(); + + // get the root page + if (!m_btree->get_root_address()) + return (HAM_KEY_NOT_FOUND); + + Page *page = env->page_manager()->fetch(context, + m_btree->get_root_address(), PageManager::kReadOnly); + BtreeNodeProxy *node = m_btree->get_node_from_page(page); + + // traverse down to the leafs + while (!node->is_leaf()) { + if (node->get_count() == 0) + page = env->page_manager()->fetch(context, node->get_ptr_down(), + PageManager::kReadOnly); + else + page = env->page_manager()->fetch(context, + node->get_record_id(context, node->get_count() - 1), + PageManager::kReadOnly); + node = m_btree->get_node_from_page(page); + } + + // and to the last page that is NOT empty + while (node->get_count() == 0) { + if (node->get_left() == 0) + return (HAM_KEY_NOT_FOUND); + page = env->page_manager()->fetch(context, node->get_left(), + PageManager::kReadOnly); + node = m_btree->get_node_from_page(page); + } + + // 
couple this cursor to the largest key in this page + couple_to_page(page, node->get_count() - 1, 0); + + // if duplicates are enabled: move to the end of the duplicate-list + if (!(flags & HAM_SKIP_DUPLICATES)) + m_duplicate_index = node->get_record_count(context, m_coupled_index) - 1; + + return (0); +} + +void +BtreeCursor::couple_to_page(Page *page, uint32_t index) +{ + ham_assert(page != 0); + + if (m_state == kStateCoupled && m_coupled_page != page) + remove_cursor_from_page(m_coupled_page); + + m_coupled_index = index; + m_state = kStateCoupled; + if (m_coupled_page == page) + return; + + m_coupled_page = page; + + // add the cursor to the page + if (page->cursor_list()) { + m_next_in_page = page->cursor_list(); + m_previous_in_page = 0; + page->cursor_list()->m_previous_in_page = this; + } + page->set_cursor_list(this); +} + +void +BtreeCursor::remove_cursor_from_page(Page *page) +{ + BtreeCursor *n, *p; + + if (this == page->cursor_list()) { + n = m_next_in_page; + if (n) + n->m_previous_in_page = 0; + page->set_cursor_list(n); + } + else { + n = m_next_in_page; + p = m_previous_in_page; + if (p) + p->m_next_in_page = n; + if (n) + n->m_previous_in_page = p; + } + + m_coupled_page = 0; + m_next_in_page = 0; + m_previous_in_page = 0; +} + +void +BtreeCursor::uncouple_all_cursors(Context *context, Page *page, int start) +{ + bool skipped = false; + Cursor *cursors = page->cursor_list() + ? page->cursor_list()->get_parent() + : 0; + + while (cursors) { + BtreeCursor *btc = cursors->get_btree_cursor(); + BtreeCursor *next = btc->m_next_in_page; + + // ignore all cursors which are already uncoupled or which are + // coupled to a key in the Transaction + if (btc->m_state == kStateCoupled) { + // skip this cursor if its position is < start + if (btc->m_coupled_index < start) { + cursors = next ? next->m_parent : 0; + skipped = true; + continue; + } + + // otherwise: uncouple the cursor from the page + btc->uncouple_from_page(context); + } + + cursors = next ? 
next->m_parent : 0; + } + + if (!skipped) + page->set_cursor_list(0); +} + +} // namespace hamsterdb diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_cursor.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_cursor.h new file mode 100644 index 0000000000..1754371875 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_cursor.h @@ -0,0 +1,246 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * btree cursors + * + * A Btree-Cursor is an object which is used to traverse a Btree. + * It is a random access iterator. + * + * Btree-Cursors are used in Cursor structures as defined in cursor.h. But + * some routines use them directly, mostly for performance reasons. Over + * time these layers will be cleaned up and the separation will be improved. + * + * The cursor implementation is very fast. Most of the operations (i.e. + * move previous/next) will not cause any disk access but are O(1) and + * in-memory only. That's because a cursor is directly "coupled" to a + * btree page (Page) that resides in memory. If the page is removed + * from memory (i.e. because the cache decides that it needs to purge the + * cache, or if there's a page split) then the cursor is "uncoupled", and a + * copy of the current key is stored in the cursor. On first access, the + * cursor is "coupled" again and basically performs a normal lookup of the key. 
+ * + * The three states of a BtreeCursor("nil", "coupled", "uncoupled") can be + * retrieved with the method get_state(), and can be modified with + * set_to_nil(), couple_to_page() and uncouple_from_page(). + * + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_CURSORS_H +#define HAM_BTREE_CURSORS_H + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "1base/dynamic_array.h" +#include "1base/error.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +struct Context; +class Cursor; +class BtreeIndex; +class Page; + +// +// The Cursor structure for a b+tree cursor +// +class BtreeCursor +{ + public: + enum { + // Cursor does not point to any key + kStateNil = 0, + // Cursor flag: the cursor is coupled + kStateCoupled = 1, + // Cursor flag: the cursor is uncoupled + kStateUncoupled = 2 + }; + + // Constructor + BtreeCursor(Cursor *parent = 0); + + // Destructor; asserts that the cursor is nil + ~BtreeCursor() { + ham_assert(m_state == kStateNil); + } + + // Returns the parent cursor + // TODO this should be private + Cursor *get_parent() { + return (m_parent); + } + + // Clones another BtreeCursor + void clone(BtreeCursor *other); + + // Returns the cursor's state (kStateCoupled, kStateUncoupled, kStateNil) + uint32_t get_state() const { + return (m_state); + } + + // Reset's the cursor's state and uninitializes it. After this call + // the cursor no longer points to any key. + void set_to_nil(); + + // Returns the page, index in this page and the duplicate index that this + // cursor is coupled to. This is used by Btree functions to optimize + // certain algorithms, i.e. when erasing the current key. + // Asserts that the cursor is coupled. 
+ void get_coupled_key(Page **page, int *index = 0, + int *duplicate_index = 0) const { + ham_assert(m_state == kStateCoupled); + if (page) + *page = m_coupled_page; + if (index) + *index = m_coupled_index; + if (duplicate_index) + *duplicate_index = m_duplicate_index; + } + + // Returns the uncoupled key of this cursor. + // Asserts that the cursor is uncoupled. + ham_key_t *get_uncoupled_key() { + ham_assert(m_state == kStateUncoupled); + return (&m_uncoupled_key); + } + + // Couples the cursor to a key directly in a page. Also sets the + // duplicate index. + void couple_to_page(Page *page, uint32_t index, + int duplicate_index) { + couple_to_page(page, index); + m_duplicate_index = duplicate_index; + } + + // Returns the duplicate index that this cursor points to. + int get_duplicate_index() const { + return (m_duplicate_index); + } + + // Sets the duplicate key we're pointing to + void set_duplicate_index(int duplicate_index) { + m_duplicate_index = duplicate_index; + } + + // Uncouples the cursor + void uncouple_from_page(Context *context); + + // Returns true if a cursor points to this btree key + bool points_to(Context *context, Page *page, int slot); + + // Returns true if a cursor points to this external key + bool points_to(Context *context, ham_key_t *key); + + // Moves the btree cursor to the next page + ham_status_t move_to_next_page(Context *context); + + // Positions the cursor on a key and retrieves the record (if |record| + // is a valid pointer) + ham_status_t find(Context *context, ham_key_t *key, ByteArray *key_arena, + ham_record_t *record, ByteArray *record_arena, + uint32_t flags); + + // Moves the cursor to the first, last, next or previous element + ham_status_t move(Context *context, ham_key_t *key, ByteArray *key_arena, + ham_record_t *record, ByteArray *record_arena, + uint32_t flags); + + // Returns the number of records of the referenced key + int get_record_count(Context *context, uint32_t flags); + + // Overwrite the record of this 
cursor + void overwrite(Context *context, ham_record_t *record, uint32_t flags); + + // retrieves the record size of the current record + uint64_t get_record_size(Context *context); + + // Closes the cursor + void close() { + set_to_nil(); + } + + // Uncouples all cursors from a page + // This method is called whenever the page is deleted or becomes invalid + static void uncouple_all_cursors(Context *context, Page *page, + int start = 0); + + private: + // Sets the key we're pointing to - if the cursor is coupled. Also + // links the Cursor with |page| (and vice versa). + void couple_to_page(Page *page, uint32_t index); + + // Removes this cursor from a page + void remove_cursor_from_page(Page *page); + + // Couples the cursor to the current page/key + // Asserts that the cursor is uncoupled. After this call the cursor + // will be coupled. + void couple(Context *context); + + // move cursor to the very first key + ham_status_t move_first(Context *context, uint32_t flags); + + // move cursor to the very last key + ham_status_t move_last(Context *context, uint32_t flags); + + // move cursor to the next key + ham_status_t move_next(Context *context, uint32_t flags); + + // move cursor to the previous key + ham_status_t move_previous(Context *context, uint32_t flags); + + // the parent cursor + Cursor *m_parent; + + // The BtreeIndex instance + BtreeIndex *m_btree; + + // "coupled" or "uncoupled" states; coupled means that the + // cursor points into a Page object, which is in + // memory. "uncoupled" means that the cursor has a copy + // of the key on which it points (i.e. because the coupled page was + // flushed to disk and removed from the cache) + int m_state; + + // the id of the duplicate key to which this cursor is coupled + int m_duplicate_index; + + // for coupled cursors: the page we're pointing to + Page *m_coupled_page; + + // ... 
and the index of the key in that page + int m_coupled_index; + + // for uncoupled cursors: a copy of the key at which we're pointing + ham_key_t m_uncoupled_key; + + // a ByteArray which backs |m_uncoupled_key.data| + ByteArray m_uncoupled_arena; + + // Linked list of cursors which point to the same page + BtreeCursor *m_next_in_page, *m_previous_in_page; +}; + +} // namespace hamsterdb + +#endif /* HAM_BTREE_CURSORS_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_erase.cc b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_erase.cc new file mode 100644 index 0000000000..1222cac8fe --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_erase.cc @@ -0,0 +1,233 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +#include <string.h> + +// Always verify that a file of level N does not include headers > N! 
+#include "1base/error.h" +#include "1base/dynamic_array.h" +#include "2page/page.h" +#include "3page_manager/page_manager.h" +#include "3blob_manager/blob_manager.h" +#include "3btree/btree_stats.h" +#include "3btree/btree_index.h" +#include "3btree/btree_update.h" +#include "3btree/btree_node_proxy.h" +#include "4db/db.h" +#include "4cursor/cursor.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +/* + * Erases key/value pairs from a btree + */ +class BtreeEraseAction : public BtreeUpdateAction +{ + public: + BtreeEraseAction(BtreeIndex *btree, Context *context, Cursor *cursor, + ham_key_t *key, int duplicate_index = 0, uint32_t flags = 0) + : BtreeUpdateAction(btree, context, cursor + ? cursor->get_btree_cursor() + : 0, duplicate_index), + m_key(key), m_flags(flags) { + if (m_cursor) + m_duplicate_index = m_cursor->get_duplicate_index() + 1; + } + + // This is the entry point for the erase operation + ham_status_t run() { + // Coupled cursor: try to remove the key directly from the page + if (m_cursor) { + if (m_cursor->get_state() == BtreeCursor::kStateCoupled) { + Page *coupled_page; + int coupled_index; + m_cursor->get_coupled_key(&coupled_page, &coupled_index); + + BtreeNodeProxy *node = m_btree->get_node_from_page(coupled_page); + ham_assert(node->is_leaf()); + + // Now try to delete the key. This can require a page split if the + // KeyList is not "delete-stable" (some compressed lists can + // grow when keys are deleted). 
+ try { + remove_entry(coupled_page, 0, coupled_index); + } + catch (Exception &ex) { + if (ex.code != HAM_LIMITS_REACHED) + throw ex; + goto fall_through; + } + // TODO if the page is empty then ask the janitor to clean it up + return (0); + +fall_through: + m_cursor->uncouple_from_page(m_context); + } + + if (m_cursor->get_state() == BtreeCursor::kStateUncoupled) + m_key = m_cursor->get_uncoupled_key(); + } + + return (erase()); + } + + private: + ham_status_t erase() { + // traverse the tree to the leaf, splitting/merging nodes as required + Page *parent; + BtreeStatistics::InsertHints hints; + Page *page = traverse_tree(m_key, hints, &parent); + BtreeNodeProxy *node = m_btree->get_node_from_page(page); + + // we have reached the leaf; search the leaf for the key + int slot = node->find_exact(m_context, m_key); + if (slot < 0) { + m_btree->get_statistics()->erase_failed(); + return (HAM_KEY_NOT_FOUND); + } + + // remove the key from the leaf + return (remove_entry(page, parent, slot)); + } + + ham_status_t remove_entry(Page *page, Page *parent, int slot) { + LocalDatabase *db = m_btree->get_db(); + BtreeNodeProxy *node = m_btree->get_node_from_page(page); + + ham_assert(slot >= 0); + ham_assert(slot < (int)node->get_count()); + + // delete the record, but only on leaf nodes! internal nodes don't have + // records; they point to pages instead, and we do not want to delete + // those. + bool has_duplicates_left = false; + if (node->is_leaf()) { + // only delete a duplicate? + if (m_duplicate_index > 0) + node->erase_record(m_context, slot, m_duplicate_index - 1, false, + &has_duplicates_left); + else + node->erase_record(m_context, slot, 0, true, 0); + } + + page->set_dirty(true); + + // still got duplicates left? then adjust all cursors + if (node->is_leaf() && has_duplicates_left && db->cursor_list()) { + Cursor *cursors = db->cursor_list(); + BtreeCursor *btcur = cursors->get_btree_cursor(); + + int duplicate_index = + m_cursor + ? 
m_cursor->get_duplicate_index() + : m_duplicate_index; + + while (btcur) { + BtreeCursor *next = 0; + if (cursors->get_next()) { + cursors = cursors->get_next(); + next = cursors->get_btree_cursor(); + } + + if (btcur != m_cursor && btcur->points_to(m_context, page, slot)) { + if (btcur->get_duplicate_index() == duplicate_index) + btcur->set_to_nil(); + else if (btcur->get_duplicate_index() > duplicate_index) + btcur->set_duplicate_index(btcur->get_duplicate_index() - 1); + } + btcur = next; + } + // all cursors were adjusted, the duplicate was deleted. return + // to caller! + return (0); + } + + // no duplicates left, the key was deleted; all cursors pointing to + // this key are set to nil, all cursors pointing to a key in the same + // page are adjusted, if necessary + if (node->is_leaf() && !has_duplicates_left && db->cursor_list()) { + Cursor *cursors = db->cursor_list(); + BtreeCursor *btcur = cursors->get_btree_cursor(); + + /* 'nil' every cursor which points to the deleted key, and adjust + * other cursors attached to the same page */ + while (btcur) { + BtreeCursor *cur = btcur; + BtreeCursor *next = 0; + if (cursors->get_next()) { + cursors = cursors->get_next(); + next = cursors->get_btree_cursor(); + } + if (btcur != m_cursor && cur->points_to(m_context, page, slot)) + cur->set_to_nil(); + else if (btcur != m_cursor + && (cur->get_state() & BtreeCursor::kStateCoupled)) { + Page *coupled_page; + int coupled_slot; + cur->get_coupled_key(&coupled_page, &coupled_slot); + if (coupled_page == page && coupled_slot > slot) + cur->uncouple_from_page(m_context); + } + btcur = next; + } + } + + if (has_duplicates_left) + return (0); + + // We've reached the leaf; it's still possible that we have to + // split the page, therefore this case has to be handled + try { + node->erase(m_context, slot); + } + catch (Exception &ex) { + if (ex.code != HAM_LIMITS_REACHED) + throw ex; + + // Split the page in the middle. 
This will invalidate the |node| pointer + // and the |slot| of the key, therefore restart the whole operation + BtreeStatistics::InsertHints hints = {0}; + split_page(page, parent, m_key, hints); + return (erase()); + } + + return (0); + } + + // the key that is retrieved + ham_key_t *m_key; + + // flags of ham_db_erase() + uint32_t m_flags; +}; + +ham_status_t +BtreeIndex::erase(Context *context, Cursor *cursor, ham_key_t *key, + int duplicate, uint32_t flags) +{ + context->db = get_db(); + + BtreeEraseAction bea(this, context, cursor, key, duplicate, flags); + return (bea.run()); +} + +} // namespace hamsterdb diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_find.cc b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_find.cc new file mode 100644 index 0000000000..05c99b5818 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_find.cc @@ -0,0 +1,226 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * btree searching + */ + +#include "0root/root.h" + +#include <string.h> + +// Always verify that a file of level N does not include headers > N! 
+#include "1base/error.h" +#include "1base/dynamic_array.h" +#include "2page/page.h" +#include "3btree/btree_index.h" +#include "3btree/btree_cursor.h" +#include "3btree/btree_stats.h" +#include "3btree/btree_node_proxy.h" +#include "3page_manager/page_manager.h" +#include "4cursor/cursor.h" +#include "4db/db.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class BtreeFindAction +{ + public: + BtreeFindAction(BtreeIndex *btree, Context *context, Cursor *cursor, + ham_key_t *key, ByteArray *key_arena, + ham_record_t *record, ByteArray *record_arena, + uint32_t flags) + : m_btree(btree), m_context(context), m_cursor(0), m_key(key), + m_record(record), m_flags(flags), m_key_arena(key_arena), + m_record_arena(record_arena) { + if (cursor && cursor->get_btree_cursor()->get_parent()) + m_cursor = cursor->get_btree_cursor(); + } + + ham_status_t run() { + LocalDatabase *db = m_btree->get_db(); + LocalEnvironment *env = db->lenv(); + Page *page = 0; + int slot = -1; + BtreeNodeProxy *node = 0; + + BtreeStatistics *stats = m_btree->get_statistics(); + BtreeStatistics::FindHints hints = stats->get_find_hints(m_flags); + + if (hints.try_fast_track) { + /* + * see if we get a sure hit within this btree leaf; if not, revert to + * regular scan + * + * As this is a speed-improvement hint re-using recent material, the + * page should still sit in the cache, or we're using old info, which + * should be discarded. + */ + page = env->page_manager()->fetch(m_context, hints.leaf_page_addr, + PageManager::kOnlyFromCache + | PageManager::kReadOnly); + if (page) { + node = m_btree->get_node_from_page(page); + ham_assert(node->is_leaf()); + + uint32_t approx_match; + slot = m_btree->find_leaf(m_context, page, m_key, m_flags, + &approx_match); + + /* + * if we didn't hit a match OR a match at either edge, FAIL. 
+ * A match at one of the edges is very risky, as this can also + * signal a match far away from the current node, so we need + * the full tree traversal then. + */ + if (approx_match || slot <= 0 || slot >= (int)node->get_count() - 1) + slot = -1; + + /* fall through */ + } + } + + uint32_t approx_match = 0; + + if (slot == -1) { + /* load the root page */ + page = env->page_manager()->fetch(m_context, + m_btree->get_root_address(), PageManager::kReadOnly); + + /* now traverse the root to the leaf nodes till we find a leaf */ + node = m_btree->get_node_from_page(page); + while (!node->is_leaf()) { + page = m_btree->find_child(m_context, page, m_key, + PageManager::kReadOnly, 0); + if (!page) { + stats->find_failed(); + return (HAM_KEY_NOT_FOUND); + } + + node = m_btree->get_node_from_page(page); + } + + /* check the leaf page for the key (shortcut w/o approx. matching) */ + if (m_flags == 0) { + slot = node->find_exact(m_context, m_key); + if (slot == -1) { + stats->find_failed(); + return (HAM_KEY_NOT_FOUND); + } + } + + /* check the leaf page for the key (long path w/ approx. 
matching), + * then fall through */ + slot = m_btree->find_leaf(m_context, page, m_key, m_flags, + &approx_match); + } + + if (slot == -1) { + // find the left sibling + if (node->get_left() > 0) { + page = env->page_manager()->fetch(m_context, node->get_left(), + PageManager::kReadOnly); + node = m_btree->get_node_from_page(page); + slot = node->get_count() - 1; + approx_match = BtreeKey::kLower; + } + } + + else if (slot >= (int)node->get_count()) { + // find the right sibling + if (node->get_right() > 0) { + page = env->page_manager()->fetch(m_context, node->get_right(), + PageManager::kReadOnly); + node = m_btree->get_node_from_page(page); + slot = 0; + approx_match = BtreeKey::kGreater; + } + else + slot = -1; + } + + if (slot < 0) { + stats->find_failed(); + return (HAM_KEY_NOT_FOUND); + } + + ham_assert(node->is_leaf()); + + /* set the cursor-position to this key */ + if (m_cursor) { + m_cursor->couple_to_page(page, slot, 0); + } + + /* approx. match: patch the key flags */ + if (approx_match) { + ham_key_set_intflags(m_key, approx_match); + } + + /* no need to load the key if we have an exact match, or if KEY_DONT_LOAD + * is set: */ + if (m_key && approx_match && !(m_flags & Cursor::kSyncDontLoadKey)) { + node->get_key(m_context, slot, m_key_arena, m_key); + } + + if (m_record) { + node->get_record(m_context, slot, m_record_arena, m_record, m_flags); + } + + return (0); + } + + private: + // the current btree + BtreeIndex *m_btree; + + // The caller's Context + Context *m_context; + + // the current cursor + BtreeCursor *m_cursor; + + // the key that is retrieved + ham_key_t *m_key; + + // the record that is retrieved + ham_record_t *m_record; + + // flags of ham_db_find() + uint32_t m_flags; + + // allocator for the key data + ByteArray *m_key_arena; + + // allocator for the record data + ByteArray *m_record_arena; +}; + +ham_status_t +BtreeIndex::find(Context *context, Cursor *cursor, ham_key_t *key, + ByteArray *key_arena, ham_record_t *record, + 
ByteArray *record_arena, uint32_t flags) +{ + BtreeFindAction bfa(this, context, cursor, key, key_arena, record, + record_arena, flags); + return (bfa.run()); +} + +} // namespace hamsterdb + diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_flags.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_flags.h new file mode 100644 index 0000000000..e0d77d8ae0 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_flags.h @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_FLAGS_H +#define HAM_BTREE_FLAGS_H + +#include "0root/root.h" + +#include "ham/types.h" + +// Always verify that a file of level N does not include headers > N! + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +// +// A helper class wrapping key-related constants into a common namespace. +// This class does not contain any logic. +// +struct BtreeKey +{ + // persisted btree key flags; also used in combination with ham_key_t._flags + enum { + // key is extended with overflow area + kExtendedKey = 0x01, + + // PRO: key is compressed; the original size is stored in the payload + kCompressed = 0x08 + }; + + // flags used with the ham_key_t::_flags (note the underscore - this + // field is for INTERNAL USE!) 
+ // + // Note: these flags should NOT overlap with the persisted flags above! + // + // As these flags NEVER will be persisted, they should be located outside + // the range of a uint16_t, i.e. outside the mask 0x0000ffff. + enum { + // Actual key is lower than the requested key + kLower = 0x00010000, + + // Actual key is greater than the requested key + kGreater = 0x00020000, + + // Actual key is an "approximate match" + kApproximate = (kLower | kGreater) + }; +}; + +// +// A helper class wrapping record-related constants into a common namespace. +// This class does not contain any logic. +// +struct BtreeRecord +{ + enum { + // record size < 8; length is encoded at byte[7] of key->ptr + kBlobSizeTiny = 0x01, + + // record size == 8; record is stored in key->ptr + kBlobSizeSmall = 0x02, + + // record size == 0; key->ptr == 0 + kBlobSizeEmpty = 0x04, + + // key has duplicates in an overflow area; this is the msb of 1 byte; + // the lower bits are the counter for the inline duplicate list + kExtendedDuplicates = 0x80 + }; +}; + +} // namespace hamsterdb + +#endif /* HAM_BTREE_FLAGS_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_impl_base.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_impl_base.h new file mode 100644 index 0000000000..d75d2a7be2 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_impl_base.h @@ -0,0 +1,475 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Base class for btree node implementations + * + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_IMPL_BASE_H +#define HAM_BTREE_IMPL_BASE_H + +#include "0root/root.h" + +#include <sstream> +#include <iostream> + +// Always verify that a file of level N does not include headers > N! +#include "1globals/globals.h" +#include "1base/dynamic_array.h" +#include "2page/page.h" +#include "3btree/btree_node.h" +#include "3btree/btree_keys_base.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +struct Context; + +template<typename KeyList, typename RecordList> +class BaseNodeImpl +{ + public: + // Constructor + BaseNodeImpl(Page *page) + : m_page(page), m_node(PBtreeNode::from_page(page)), + m_estimated_capacity(0), m_keys(page->get_db()), + m_records(page->get_db(), m_node) { + } + + // Returns the estimated page's capacity + size_t estimate_capacity() const { + return (m_estimated_capacity); + } + + // Checks this node's integrity + virtual void check_integrity(Context *context) const { + } + + // Returns a copy of a key and stores it in |dest| + void get_key(Context *context, int slot, ByteArray *arena, + ham_key_t *dest) { + // copy (or assign) the key data + m_keys.get_key(context, slot, arena, dest, true); + } + + // Returns the record size of a key or one of its duplicates + uint64_t get_record_size(Context *context, int slot, int duplicate_index) { + return (m_records.get_record_size(context, slot, duplicate_index)); + } + + // Returns the record counter of a key + int get_record_count(Context *context, int slot) { + return (m_records.get_record_count(context, slot)); + } + + // Returns the full record and stores it in |dest| + void get_record(Context *context, int slot, ByteArray *arena, + ham_record_t *record, uint32_t flags, int duplicate_index) { + // copy the record 
data + m_records.get_record(context, slot, arena, record, + flags, duplicate_index); + } + + // Updates the record of a key + void set_record(Context *context, int slot, ham_record_t *record, + int duplicate_index, uint32_t flags, + uint32_t *new_duplicate_index) { + // automatically overwrite an existing key unless this is a + // duplicate operation + if ((flags & (HAM_DUPLICATE + | HAM_DUPLICATE + | HAM_DUPLICATE_INSERT_BEFORE + | HAM_DUPLICATE_INSERT_AFTER + | HAM_DUPLICATE_INSERT_FIRST + | HAM_DUPLICATE_INSERT_LAST)) == 0) + flags |= HAM_OVERWRITE; + + m_records.set_record(context, slot, duplicate_index, record, flags, + new_duplicate_index); + } + + // Erases the extended part of a key + void erase_extended_key(Context *context, int slot) { + m_keys.erase_extended_key(context, slot); + } + + // Erases the record + void erase_record(Context *context, int slot, int duplicate_index, + bool all_duplicates) { + m_records.erase_record(context, slot, duplicate_index, all_duplicates); + } + + // Erases a key + void erase(Context *context, int slot) { + size_t node_count = m_node->get_count(); + + m_keys.erase(context, node_count, slot); + m_records.erase(context, node_count, slot); + } + + // Inserts a new key + // + // Most KeyLists first calculate the slot of the new key, then insert + // the key at this slot. Both operations are separate from each other. + // However, compressed KeyLists can overwrite this behaviour and + // combine both calls into one to save performance. 
+ template<typename Cmp> + PBtreeNode::InsertResult insert(Context *context, ham_key_t *key, + uint32_t flags, Cmp &comparator) { + PBtreeNode::InsertResult result(0, 0); + size_t node_count = m_node->get_count(); + + if (node_count == 0) + result.slot = 0; + else if (flags & PBtreeNode::kInsertPrepend) + result.slot = 0; + else if (flags & PBtreeNode::kInsertAppend) + result.slot = node_count; + else { + int cmp; + result.slot = find_lowerbound_impl(context, key, comparator, &cmp); + + /* insert the new key at the beginning? */ + if (result.slot == -1) { + result.slot = 0; + ham_assert(cmp != 0); + } + /* key exists already */ + else if (cmp == 0) { + result.status = HAM_DUPLICATE_KEY; + return (result); + } + /* if the new key is > than the slot key: move to the next slot */ + else if (cmp > 0) + result.slot++; + } + + // Uncouple the cursors. + // + // for custom inserts we have to uncouple all cursors, because the + // KeyList doesn't have access to the cursors in the page. In this + // case result.slot is 0. + if ((int)node_count > result.slot) + BtreeCursor::uncouple_all_cursors(context, m_page, result.slot); + + // make space for 1 additional element. 
+ // only store the key data; flags and record IDs are set by the caller + result = m_keys.insert(context, node_count, key, flags, comparator, + result.slot); + m_records.insert(context, node_count, result.slot); + return (result); + } + + // Compares two keys using the supplied comparator + template<typename Cmp> + int compare(Context *context, const ham_key_t *lhs, + uint32_t rhs, Cmp &cmp) { + if (KeyList::kHasSequentialData) { + return (cmp(lhs->data, lhs->size, m_keys.get_key_data(rhs), + m_keys.get_key_size(rhs))); + } + else { + ham_key_t tmp = {0}; + m_keys.get_key(context, rhs, &m_arena, &tmp, false); + return (cmp(lhs->data, lhs->size, tmp.data, tmp.size)); + } + } + + // Searches the node for the key and returns the slot of this key + template<typename Cmp> + int find_child(Context *context, ham_key_t *key, Cmp &comparator, + uint64_t *precord_id, int *pcmp) { + int slot = find_lowerbound_impl(context, key, comparator, pcmp); + if (precord_id) { + if (slot == -1) + *precord_id = m_node->get_ptr_down(); + else + *precord_id = m_records.get_record_id(slot); + } + return (slot); + } + + // Searches the node for the key and returns the slot of this key + // - only for exact matches! + template<typename Cmp> + int find_exact(Context *context, ham_key_t *key, Cmp &comparator) { + int cmp = 0; + int r = find_exact_impl(context, key, comparator, &cmp); + return (cmp ? -1 : r); + } + + // Splits a node and moves parts of the current node into |other|, starting + // at the |pivot| slot + void split(Context *context, BaseNodeImpl<KeyList, RecordList> *other, + int pivot) { + size_t node_count = m_node->get_count(); + size_t other_node_count = other->m_node->get_count(); + + // + // if a leaf page is split then the pivot element must be inserted in + // the leaf page AND in the internal node. the internal node update + // is handled by the caller. + // + // in internal nodes the pivot element is only propagated to the + // parent node. the pivot element is skipped. 
+ // + if (m_node->is_leaf()) { + m_keys.copy_to(pivot, node_count, other->m_keys, + other_node_count, 0); + m_records.copy_to(pivot, node_count, other->m_records, + other_node_count, 0); + } + else { + m_keys.copy_to(pivot + 1, node_count, other->m_keys, + other_node_count, 0); + m_records.copy_to(pivot + 1, node_count, other->m_records, + other_node_count, 0); + } + } + + // Returns true if the node requires a merge or a shift + bool requires_merge() const { + return (m_node->get_count() <= 3); + } + + // Merges this node with the |other| node + void merge_from(Context *context, + BaseNodeImpl<KeyList, RecordList> *other) { + size_t node_count = m_node->get_count(); + size_t other_node_count = other->m_node->get_count(); + + // shift items from the sibling to this page + if (other_node_count > 0) { + other->m_keys.copy_to(0, other_node_count, m_keys, + node_count, node_count); + other->m_records.copy_to(0, other_node_count, m_records, + node_count, node_count); + } + } + + // Reorganize this node; re-arranges capacities of KeyList and RecordList + // in order to free space and avoid splits + bool reorganize(Context *context, const ham_key_t *key) const { + return (false); + } + + // Fills the btree_metrics structure + void fill_metrics(btree_metrics_t *metrics, size_t node_count) { + metrics->number_of_pages++; + metrics->number_of_keys += node_count; + + BtreeStatistics::update_min_max_avg(&metrics->keys_per_page, node_count); + + m_keys.fill_metrics(metrics, node_count); + m_records.fill_metrics(metrics, node_count); + } + + // Prints a slot to stdout (for debugging) + void print(Context *context, int slot) { + std::stringstream ss; + ss << " "; + m_keys.print(context, slot, ss); + ss << " -> "; + m_records.print(context, slot, ss); + std::cout << ss.str() << std::endl; + } + + // Returns the record id + uint64_t get_record_id(Context *context, int slot) const { + return (m_records.get_record_id(slot)); + } + + // Sets the record id + void set_record_id(Context 
*context, int slot, uint64_t ptr) { + m_records.set_record_id(slot, ptr); + } + + // The page we're operating on + Page *m_page; + + // The node we're operating on + PBtreeNode *m_node; + + // Capacity of this node (maximum number of key/record pairs that + // can be stored) + size_t m_estimated_capacity; + + // for accessing the keys + KeyList m_keys; + + // for accessing the records + RecordList m_records; + + private: + // Implementation of the find method for lower-bound matches. If there + // is no exact match then the lower bound is returned, and the compare value + // is returned in |*pcmp|. + template<typename Cmp> + int find_lowerbound_impl(Context *context, const ham_key_t *key, + Cmp &comparator, int *pcmp) { + switch ((int)KeyList::kSearchImplementation) { + case BaseKeyList::kBinaryLinear: + return (find_impl_binlin(context, key, comparator, pcmp)); + case BaseKeyList::kCustomSearch: + return (m_keys.find(context, m_node->get_count(), key, + comparator, pcmp)); + default: // BaseKeyList::kBinarySearch + return (find_impl_binary(context, key, comparator, pcmp)); + } + } + + // Implementation of the find method for exact matches. Supports a custom + // search implementation in the KeyList (i.e. for SIMD). 
+ template<typename Cmp> + int find_exact_impl(Context *context, const ham_key_t *key, + Cmp &comparator, int *pcmp) { + switch ((int)KeyList::kSearchImplementation) { + case BaseKeyList::kBinaryLinear: + return (find_impl_binlin(context, key, comparator, pcmp)); + case BaseKeyList::kCustomSearch: + case BaseKeyList::kCustomExactImplementation: + return (m_keys.find(context, m_node->get_count(), key, + comparator, pcmp)); + default: // BaseKeyList::kBinarySearch + return (find_impl_binary(context, key, comparator, pcmp)); + } + } + + // Binary search; returns the lower-bound slot or -1, result in |*pcmp| + template<typename Cmp> + int find_impl_binary(Context *context, const ham_key_t *key, + Cmp &comparator, int *pcmp) { + size_t node_count = m_node->get_count(); + ham_assert(node_count > 0); + + int i, l = 0, r = (int)node_count; + int last = node_count + 1; + int cmp = -1; + + /* repeat until the key is found or the remaining range is exhausted + * (this variant has no fallback to a linear search) */ + while (r - l > 0) { + /* get the median item; if it's identical with the "last" item, + * we've found the slot */ + i = (l + r) / 2; + + if (i == last) { + ham_assert(i >= 0); + ham_assert(i < (int)node_count); + *pcmp = 1; + return (i); + } + + /* compare it against the key */ + cmp = compare(context, key, i, comparator); + + /* found it? 
*/ + if (cmp == 0) { + *pcmp = cmp; + return (i); + } + /* if the key is smaller than the item: search "to the left" */ + else if (cmp < 0) { + if (r == 0) { + ham_assert(i == 0); + *pcmp = cmp; + return (-1); + } + r = i; + } + /* otherwise search "to the right" */ + else { + last = i; + l = i; + } + } + + *pcmp = cmp; + return (-1); + } + + // Binary search combined with linear search + template<typename Cmp> + int find_impl_binlin(Context *context, const ham_key_t *key, + Cmp &comparator, int *pcmp) { + size_t node_count = m_node->get_count(); + ham_assert(node_count > 0); + + int i, l = 0, r = (int)node_count; + int last = node_count + 1; + int cmp = -1; + + // Run a binary search, but fall back to linear search as soon as + // the remaining range is too small. Sets threshold to 0 if linear + // search is disabled for this KeyList. + int threshold = m_keys.get_linear_search_threshold(); + + /* repeat till we found the key or the remaining range is so small that + * we rather perform a linear search (which is faster for small ranges) */ + while (r - l > threshold) { + /* get the median item; if it's identical with the "last" item, + * we've found the slot */ + i = (l + r) / 2; + + if (i == last) { + ham_assert(i >= 0); + ham_assert(i < (int)node_count); + *pcmp = 1; + return (i); + } + + /* compare it against the key */ + cmp = compare(context, key, i, comparator); + + /* found it? */ + if (cmp == 0) { + *pcmp = cmp; + return (i); + } + /* if the key is smaller than the item: search "to the left" */ + else if (cmp < 0) { + if (r == 0) { + ham_assert(i == 0); + *pcmp = cmp; + return (-1); + } + r = i; + } + /* otherwise search "to the right" */ + else { + last = i; + l = i; + } + } + + // still here? 
then perform a linear search for the remaining range + ham_assert(r - l <= threshold); + return (m_keys.linear_search(l, r - l, key, comparator, pcmp)); + } + + // A memory arena for various tasks + ByteArray m_arena; +}; + +} // namespace hamsterdb + +#endif /* HAM_BTREE_IMPL_BASE_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_impl_default.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_impl_default.h new file mode 100644 index 0000000000..0e7e5618cc --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_impl_default.h @@ -0,0 +1,532 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Btree node layout for variable length keys/records and/or duplicates + * ==================================================================== + * + * This is the default hamsterdb layout. It is chosen for + * 1. variable length keys (with or without duplicates) + * 2. fixed length keys with duplicates + * + * Like the PAX layout implemented in btree_impl_pax.h, the layout implemented + * here stores key data and records separated from each other. This layout is + * more complex, because it is capable of resizing the KeyList and RecordList + * if the node becomes full. + * + * The flat memory layout looks like this: + * + * |Idx1|Idx2|...|Idxn|F1|F2|...|Fn|...(space)...|Key1|Key2|...|Keyn| + * + * ... 
where Idx<n> are the indices (of slot <n>) + * where F<n> are freelist entries + * where Key<n> is the key data of slot <n>. + * + * In addition, the first few bytes in the node store the following + * information: + * 0 (4 bytes): total capacity of index keys (used keys + freelist) + * 4 (4 bytes): number of used freelist entries + * 8 (4 bytes): offset for the next key at the end of the page + * + * In total, |capacity| contains the number of maximum keys (and index + * entries) that can be stored in the node. The number of used index keys + * is in |m_node->get_count()|. The number of used freelist entries is + * returned by |get_freelist_count()|. The freelist indices start directly + * after the key indices. The key space (with key data and records) starts at + * N * capacity, where |N| is the size of an index entry (the size depends + * on the actual btree configuration, i.e. whether key size is fixed, + * duplicates are used etc). + * + * If records have fixed length then all records of a key (with duplicates) + * are stored next to each other. If they have variable length then each of + * these records is stored with 1 byte for flags: + * Rec1|F1|Rec2|F2|... + * where Recn is an 8 bytes record-ID (offset in the file) OR inline record, + * and F1 is 1 byte for flags (kBlobSizeSmall etc). + * + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_IMPL_DEFAULT_H +#define HAM_BTREE_IMPL_DEFAULT_H + +#include "0root/root.h" + +#include <algorithm> +#include <iostream> +#include <vector> +#include <map> + +// Always verify that a file of level N does not include headers > N! 
+#include "1globals/globals.h" +#include "1base/dynamic_array.h" +#include "2page/page.h" +#include "3blob_manager/blob_manager.h" +#include "3btree/btree_index.h" +#include "3btree/btree_impl_base.h" +#include "3btree/btree_node.h" +#include "3btree/btree_visitor.h" +#include "4env/env_local.h" +#include "4db/db_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +// +// A BtreeNodeProxy layout which can handle... +// +// 1. fixed length keys w/ duplicates +// 2. variable length keys w/ duplicates +// 3. variable length keys w/o duplicates +// +// Fixed length keys are stored sequentially and reuse the layout from pax. +// Same for the distinct RecordList (if duplicates are disabled). +// +template<typename KeyList, typename RecordList> +class DefaultNodeImpl : public BaseNodeImpl<KeyList, RecordList> +{ + // C++ does not allow access to members of base classes unless they're + // explicitly named; this typedef helps to make the code "less" ugly, + // but it still sucks that i have to use it + // + // http://stackoverflow.com/questions/1120833/derived-template-class-access-to-base-class-member-data + typedef BaseNodeImpl<KeyList, RecordList> P; + + // the type of |this| object + typedef DefaultNodeImpl<KeyList, RecordList> NodeType; + + enum { + // for capacity + kPayloadOffset = 4 + }; + + public: + // Constructor + DefaultNodeImpl(Page *page) + : BaseNodeImpl<KeyList, RecordList>(page) { + initialize(); + } + + // Checks the integrity of this node. Throws an exception if there is a + // violation. 
+ virtual void check_integrity(Context *context) const { + size_t node_count = P::m_node->get_count(); + if (node_count == 0) + return; + + check_index_integrity(context, node_count); + } + + // Iterates the keys from slot |start| on, calls the |visitor| on each + void scan(Context *context, ScanVisitor *visitor, uint32_t start, + bool distinct) { +#ifdef HAM_DEBUG + check_index_integrity(context, P::m_node->get_count()); +#endif + + // a distinct scan over fixed-length keys can be moved to the KeyList + if (KeyList::kSupportsBlockScans && distinct) { + P::m_keys.scan(context, visitor, start, P::m_node->get_count() - start); + return; + } + + // otherwise visit slots [start, count); |node_count| is the END index + ham_key_t key = {0}; + ByteArray arena; + size_t node_count = P::m_node->get_count(); + + for (size_t i = start; i < node_count; i++) { + P::m_keys.get_key(context, i, &arena, &key, false); + (*visitor)(key.data, key.size, distinct + ? 1 + : P::get_record_count(context, i)); + } + } + + // Returns the full record and stores it in |dest| + void get_record(Context *context, int slot, ByteArray *arena, + ham_record_t *record, uint32_t flags, int duplicate_index) { +#ifdef HAM_DEBUG + check_index_integrity(context, P::m_node->get_count()); +#endif + P::get_record(context, slot, arena, record, flags, duplicate_index); + } + + // Updates the record of a key + void set_record(Context *context, int slot, ham_record_t *record, + int duplicate_index, uint32_t flags, + uint32_t *new_duplicate_index) { + P::set_record(context, slot, record, duplicate_index, + flags, new_duplicate_index); +#ifdef HAM_DEBUG + check_index_integrity(context, P::m_node->get_count()); +#endif + } + + // Erases the record + void erase_record(Context *context, int slot, int duplicate_index, + bool all_duplicates) { + P::erase_record(context, slot, duplicate_index, all_duplicates); +#ifdef HAM_DEBUG + check_index_integrity(context, P::m_node->get_count()); +#endif + } + + // Erases a key + void erase(Context 
*context, int slot) { + P::erase(context, slot); +#ifdef HAM_DEBUG + check_index_integrity(context, P::m_node->get_count() - 1); +#endif + } + + // Returns true if |key| cannot be inserted because a split is required. + // This function will try to re-arrange the node in order for the new + // key to fit in. + bool requires_split(Context *context, const ham_key_t *key) { + size_t node_count = P::m_node->get_count(); + + // the node is empty? that's either because nothing was inserted yet, + // or because all keys were erased. For the latter case make sure + // that no garbage remains behind, otherwise it's possible that + // following inserts can fail + if (node_count == 0) { + P::m_records.vacuumize(node_count, true); + P::m_keys.vacuumize(node_count, true); + return (false); + } + + bool keys_require_split = P::m_keys.requires_split(node_count, key); + bool records_require_split = P::m_records.requires_split(node_count); + if (!keys_require_split && !records_require_split) + return (false); + + // first try to vacuumize the lists without rearranging them + if (keys_require_split) { + P::m_keys.vacuumize(node_count, false); + keys_require_split = P::m_keys.requires_split(node_count, key); + } + + if (records_require_split) { + P::m_records.vacuumize(node_count, false); + records_require_split = P::m_records.requires_split(node_count); + } + + if (!keys_require_split && !records_require_split) + return (false); + + // now adjust the ranges and the capacity + if (reorganize(context, key)) { +#ifdef HAM_DEBUG + check_index_integrity(context, node_count); +#endif + return (false); + } + +#ifdef HAM_DEBUG + check_index_integrity(context, node_count); +#endif + + // still here? 
then there's no way to avoid the split + BtreeIndex *bi = P::m_page->get_db()->btree_index(); + bi->get_statistics()->set_keylist_range_size(P::m_node->is_leaf(), + load_range_size()); + bi->get_statistics()->set_keylist_capacities(P::m_node->is_leaf(), + node_count); + return (true); + } + + // Splits this node and moves some/half of the keys to |other| + void split(Context *context, DefaultNodeImpl *other, int pivot) { + size_t node_count = P::m_node->get_count(); + +#ifdef HAM_DEBUG + check_index_integrity(context, node_count); + ham_assert(other->m_node->get_count() == 0); +#endif + + // make sure that the other node has enough free space + other->initialize(this); + + P::split(context, other, pivot); + + P::m_keys.vacuumize(pivot, true); + P::m_records.vacuumize(pivot, true); + +#ifdef HAM_DEBUG + check_index_integrity(context, pivot); + if (P::m_node->is_leaf()) + other->check_index_integrity(context, node_count - pivot); + else + other->check_index_integrity(context, node_count - pivot - 1); +#endif + } + + // Merges keys from |other| to this node + void merge_from(Context *context, DefaultNodeImpl *other) { + size_t node_count = P::m_node->get_count(); + + P::m_keys.vacuumize(node_count, true); + P::m_records.vacuumize(node_count, true); + + P::merge_from(context, other); + +#ifdef HAM_DEBUG + check_index_integrity(context, node_count + other->m_node->get_count()); +#endif + } + + // Adjusts the size of both lists; either increases it or decreases + // it (in order to free up space for variable length data). + // Returns true if |key| and an additional record can be inserted, or + // false if not; in this case the caller must perform a split. 
+ bool reorganize(Context *context, const ham_key_t *key) { + size_t node_count = P::m_node->get_count(); + + // One of the lists must be resizable (otherwise they would be managed + // by the PaxLayout) + ham_assert(!KeyList::kHasSequentialData + || !RecordList::kHasSequentialData); + + // Retrieve the minimum sizes that both lists require to store their + // data + size_t capacity_hint; + size_t old_key_range_size = load_range_size(); + size_t key_range_size, record_range_size; + size_t required_key_range, required_record_range; + size_t usable_size = usable_range_size(); + required_key_range = P::m_keys.get_required_range_size(node_count) + + P::m_keys.get_full_key_size(key); + required_record_range = P::m_records.get_required_range_size(node_count) + + P::m_records.get_full_record_size(); + + uint8_t *p = P::m_node->get_data(); + p += sizeof(uint32_t); + + // no records? then there's no way to change the ranges. but maybe we + // can increase the capacity + if (required_record_range == 0) { + if (required_key_range > usable_size) + return (false); + P::m_keys.change_range_size(node_count, p, usable_size, + node_count + 5); + return (!P::m_keys.requires_split(node_count, key)); + } + + int remainder = usable_size + - (required_key_range + required_record_range); + if (remainder < 0) + return (false); + + // Now split the remainder between both lists + size_t additional_capacity = remainder + / (P::m_keys.get_full_key_size(0) + + P::m_records.get_full_record_size()); + if (additional_capacity == 0) + return (false); + + key_range_size = required_key_range + additional_capacity + * P::m_keys.get_full_key_size(0); + record_range_size = usable_size - key_range_size; + + ham_assert(key_range_size + record_range_size <= usable_size); + + // Check if the required record space is large enough, and make sure + // there is enough room for a new item + if (key_range_size > usable_size + || record_range_size > usable_size + || key_range_size == old_key_range_size + || 
key_range_size < required_key_range + || record_range_size < required_record_range + || key_range_size + record_range_size > usable_size) + return (false); + + capacity_hint = get_capacity_hint(key_range_size, record_range_size); + + // sanity check: make sure that the new capacity would be big + // enough for all the keys + if (capacity_hint > 0 && capacity_hint < node_count) + return (false); + + if (capacity_hint == 0) { + BtreeStatistics *bstats = P::m_page->get_db()->btree_index()->get_statistics(); + capacity_hint = bstats->get_keylist_capacities(P::m_node->is_leaf()); + } + + if (capacity_hint < node_count) + capacity_hint = node_count + 1; + + // Get a pointer to the data area and persist the new range size + // of the KeyList + store_range_size(key_range_size); + + // Now update the lists. If the KeyList grows then start with resizing + // the RecordList, otherwise the moved KeyList will overwrite the + // beginning of the RecordList. + if (key_range_size > old_key_range_size) { + P::m_records.change_range_size(node_count, p + key_range_size, + usable_size - key_range_size, + capacity_hint); + P::m_keys.change_range_size(node_count, p, key_range_size, + capacity_hint); + } + // And vice versa if the RecordList grows + else { + P::m_keys.change_range_size(node_count, p, key_range_size, + capacity_hint); + P::m_records.change_range_size(node_count, p + key_range_size, + usable_size - key_range_size, + capacity_hint); + } + + // make sure that the page is flushed to disk + P::m_page->set_dirty(true); + +#ifdef HAM_DEBUG + check_index_integrity(context, node_count); +#endif + + // finally check if the new space is sufficient for the new key + // TODO this shouldn't be required if the check above is implemented + // -> change to an assert, then return true + return (!P::m_records.requires_split(node_count) + && !P::m_keys.requires_split(node_count, key)); + } + + private: + // Initializes the node + void initialize(NodeType *other = 0) { + LocalDatabase *db = 
P::m_page->get_db(); + size_t usable_size = usable_range_size(); + + // initialize this page in the same way as |other| was initialized + if (other) { + size_t key_range_size = other->load_range_size(); + + // persist the range size + store_range_size(key_range_size); + uint8_t *p = P::m_node->get_data(); + p += sizeof(uint32_t); + + // create the KeyList and RecordList + P::m_keys.create(p, key_range_size); + P::m_records.create(p + key_range_size, + usable_size - key_range_size); + } + // initialize a new page from scratch + else if ((P::m_node->get_count() == 0 + && !(db->get_flags() & HAM_READ_ONLY))) { + size_t key_range_size; + size_t record_range_size; + + // if yes then ask the btree for the default range size (it keeps + // track of the average range size of older pages). + BtreeStatistics *bstats = db->btree_index()->get_statistics(); + key_range_size = bstats->get_keylist_range_size(P::m_node->is_leaf()); + + // no data so far? then come up with a good default + if (key_range_size == 0) { + // no records? 
then assign the full range to the KeyList + if (P::m_records.get_full_record_size() == 0) { + key_range_size = usable_size; + } + // Otherwise split the range between both lists + else { + size_t capacity = usable_size + / (P::m_keys.get_full_key_size(0) + + P::m_records.get_full_record_size()); + key_range_size = capacity * P::m_keys.get_full_key_size(0); + } + } + + record_range_size = usable_size - key_range_size; + + ham_assert(key_range_size + record_range_size <= usable_size); + + // persist the key range size + store_range_size(key_range_size); + uint8_t *p = P::m_node->get_data(); + p += sizeof(uint32_t); + + // and create the lists + P::m_keys.create(p, key_range_size); + P::m_records.create(p + key_range_size, record_range_size); + + P::m_estimated_capacity = key_range_size + / (size_t)P::m_keys.get_full_key_size(); + } + // open a page; read initialization parameters from persisted storage + else { + size_t key_range_size = load_range_size(); + size_t record_range_size = usable_size - key_range_size; + uint8_t *p = P::m_node->get_data(); + p += sizeof(uint32_t); + + P::m_keys.open(p, key_range_size, P::m_node->get_count()); + P::m_records.open(p + key_range_size, record_range_size, + P::m_node->get_count()); + + P::m_estimated_capacity = key_range_size + / (size_t)P::m_keys.get_full_key_size(); + } + } + + // Try to get a clue about the capacity of the lists; this will help + // those lists with an UpfrontIndex to better arrange their layout + size_t get_capacity_hint(size_t key_range_size, size_t record_range_size) { + if (KeyList::kHasSequentialData) + return (key_range_size / P::m_keys.get_full_key_size()); + if (RecordList::kHasSequentialData && P::m_records.get_full_record_size()) + return (record_range_size / P::m_records.get_full_record_size()); + return (0); + } + + // Checks the integrity of the key- and record-ranges. Throws an exception + // if there's a problem. 
+ void check_index_integrity(Context *context, size_t node_count) const { + P::m_keys.check_integrity(context, node_count); + P::m_records.check_integrity(context, node_count); + } + + // Returns the usable page size that can be used for actually + // storing the data + size_t usable_range_size() const { + return (Page::usable_page_size(P::m_page->get_db()->lenv()->config().page_size_bytes) + - kPayloadOffset + - PBtreeNode::get_entry_offset() + - sizeof(uint32_t)); + } + + // Persists the KeyList's range size + void store_range_size(size_t key_range_size) { + uint8_t *p = P::m_node->get_data(); + *(uint32_t *)p = (uint32_t)key_range_size; + } + + // Load the stored KeyList's range size + size_t load_range_size() const { + uint8_t *p = P::m_node->get_data(); + return (*(uint32_t *)p); + } +}; + +} // namespace hamsterdb + +#endif /* HAM_BTREE_IMPL_DEFAULT_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_impl_pax.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_impl_pax.h new file mode 100644 index 0000000000..3a87f1c914 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_impl_pax.h @@ -0,0 +1,141 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Btree node layout for fixed length keys WITHOUT duplicates + * ========================================================== + * + * This layout supports fixed length keys and fixed length records. 
It does + * not support duplicates and extended keys. Keys and records are always + * inlined, but records can refer to blobs (in this case the "fixed length" + * record is the 8 byte record ID). + * + * Unlike the academic PAX paper, which stored multiple columns in one page, + * hamsterdb stores only one column (= database) in a page, but keys and + * records are separated from each other. The keys (flags + key data) are + * stored in the beginning of the page, the records start somewhere in the + * middle (the exact start position depends on key size, page size and other + * parameters). + * + * This layout's implementation is relatively simple because the offset + * of the key data and record data is easy to calculate since all keys + * and records have the same size. + * + * This separation of keys and records allows a more compact layout and a + * high density of the key data, which better exploits CPU caches and allows + * very tight loops when searching through the keys. + * + * This layout has two incarnations: + * 1. Fixed length keys, fixed length inline records + * -> does not require additional flags + * 2. Fixed length keys, variable length records (8 byte record id) + * -> requires a 1 byte flag per key + * + * The flat memory layout looks like this: + * + * |Flag1|Flag2|...|Flagn|...|Key1|Key2|...|Keyn|...|Rec1|Rec2|...|Recn| + * + * Flags are optional, as described above. + * + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_IMPL_PAX_H +#define HAM_BTREE_IMPL_PAX_H + +#include "0root/root.h" + +#include <sstream> +#include <iostream> + +// Always verify that a file of level N does not include headers > N! 
+#include "1globals/globals.h" +#include "1base/dynamic_array.h" +#include "2page/page.h" +#include "3blob_manager/blob_manager.h" +#include "3btree/btree_node.h" +#include "3btree/btree_impl_base.h" +#include "4env/env_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +// +// A BtreeNodeProxy layout which stores key data, key flags and +// and the record pointers in a PAX style layout. +// +template<typename KeyList, typename RecordList> +class PaxNodeImpl : public BaseNodeImpl<KeyList, RecordList> +{ + // C++ does not allow access to members of base classes unless they're + // explicitly named; this typedef helps to make the code "less" ugly, + // but it still sucks that i have to use it + // + // http://stackoverflow.com/questions/1120833/derived-template-class-access-to-base-class-member-data + typedef BaseNodeImpl<KeyList, RecordList> P; + + public: + // Constructor + PaxNodeImpl(Page *page) + : BaseNodeImpl<KeyList, RecordList>(page) { + initialize(); + } + + // Iterates all keys, calls the |visitor| on each + void scan(Context *context, ScanVisitor *visitor, uint32_t start, + bool distinct) { + P::m_keys.scan(context, visitor, start, P::m_node->get_count() - start); + } + + // Returns true if |key| cannot be inserted because a split is required + bool requires_split(Context *context, const ham_key_t *key) const { + return (P::m_node->get_count() >= P::m_estimated_capacity); + } + + private: + void initialize() { + uint32_t usable_nodesize + = Page::usable_page_size(P::m_page->get_db()->lenv()->config().page_size_bytes) + - PBtreeNode::get_entry_offset(); + size_t ks = P::m_keys.get_full_key_size(); + size_t rs = P::m_records.get_full_record_size(); + size_t capacity = usable_nodesize / (ks + rs); + + uint8_t *p = P::m_node->get_data(); + if (P::m_node->get_count() == 0) { + P::m_keys.create(&p[0], capacity * ks); + P::m_records.create(&p[capacity * ks], capacity * rs); + } + else { + size_t key_range_size = 
capacity * ks; + size_t record_range_size = capacity * rs; + + P::m_keys.open(p, key_range_size, P::m_node->get_count()); + P::m_records.open(p + key_range_size, record_range_size, + P::m_node->get_count()); + } + + P::m_estimated_capacity = capacity; + } +}; + +} // namespace hamsterdb + +#endif /* HAM_BTREE_IMPL_PAX_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_index.cc b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_index.cc new file mode 100644 index 0000000000..a934ba441b --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_index.cc @@ -0,0 +1,269 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +#include <string.h> + +// Always verify that a file of level N does not include headers > N! 
+#include "1base/error.h" +#include "2page/page.h" +#include "3page_manager/page_manager.h" +#include "3btree/btree_index.h" +#include "3btree/btree_index_factory.h" +#include "3btree/btree_node_proxy.h" +#include "4db/db.h" +#include "4env/env.h" +#include "4cursor/cursor.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +uint64_t BtreeIndex::ms_btree_smo_split = 0; +uint64_t BtreeIndex::ms_btree_smo_merge = 0; +uint64_t BtreeIndex::ms_btree_smo_shift = 0; + +BtreeIndex::BtreeIndex(LocalDatabase *db, PBtreeHeader *btree_header, + uint32_t flags, uint32_t key_type, uint32_t key_size) + : m_db(db), m_key_size(0), m_key_type(key_type), m_rec_size(0), + m_btree_header(btree_header), m_flags(flags), m_root_address(0) +{ + m_leaf_traits = BtreeIndexFactory::create(db, flags, key_type, + key_size, true); + m_internal_traits = BtreeIndexFactory::create(db, flags, key_type, + key_size, false); +} + +void +BtreeIndex::create(Context *context, uint16_t key_type, uint32_t key_size, + uint32_t rec_size) +{ + ham_assert(key_size != 0); + + /* allocate a new root page */ + Page *root = m_db->lenv()->page_manager()->alloc(context, + Page::kTypeBroot, PageManager::kClearWithZero); + + // initialize the new page + PBtreeNode *node = PBtreeNode::from_page(root); + node->set_flags(PBtreeNode::kLeafNode); + + m_key_size = key_size; + m_key_type = key_type; + m_rec_size = rec_size; + m_root_address = root->get_address(); + + flush_descriptor(context); +} + +void +BtreeIndex::open() +{ + uint64_t rootadd; + uint16_t key_size; + uint16_t key_type; + uint32_t flags; + uint32_t rec_size; + + key_size = m_btree_header->get_key_size(); + key_type = m_btree_header->get_key_type(); + rec_size = m_btree_header->get_record_size(); + rootadd = m_btree_header->get_root_address(); + flags = m_btree_header->get_flags(); + + ham_assert(key_size > 0); + ham_assert(rootadd > 0); + + m_root_address = rootadd; + m_key_size = key_size; + m_key_type = key_type; 
+ m_flags = flags; + m_rec_size = rec_size; +} + +void +BtreeIndex::set_record_compression(Context *context, int algo) +{ + m_btree_header->set_record_compression(algo); + flush_descriptor(context); +} + +int +BtreeIndex::get_record_compression() +{ + return (m_btree_header->get_record_compression()); +} + +void +BtreeIndex::set_key_compression(Context *context, int algo) +{ + m_btree_header->set_key_compression(algo); + flush_descriptor(context); +} + +int +BtreeIndex::get_key_compression() +{ + return (m_btree_header->get_key_compression()); +} + +void +BtreeIndex::flush_descriptor(Context *context) +{ + if (m_db->get_flags() & HAM_READ_ONLY) + return; + + m_btree_header->set_dbname(m_db->name()); + m_btree_header->set_key_size(get_key_size()); + m_btree_header->set_rec_size(get_record_size()); + m_btree_header->set_key_type(get_key_type()); + m_btree_header->set_root_address(get_root_address()); + m_btree_header->set_flags(get_flags()); +} + +Page * +BtreeIndex::find_child(Context *context, Page *page, const ham_key_t *key, + uint32_t page_manager_flags, int *idxptr) +{ + BtreeNodeProxy *node = get_node_from_page(page); + + // make sure that we're not in a leaf page, and that the + // page is not empty + ham_assert(node->get_ptr_down() != 0); + + uint64_t record_id; + int slot = node->find_child(context, (ham_key_t *)key, &record_id); + + if (idxptr) + *idxptr = slot; + + return (m_db->lenv()->page_manager()->fetch(context, + record_id, page_manager_flags)); +} + +int +BtreeIndex::find_leaf(Context *context, Page *page, ham_key_t *key, + uint32_t flags, uint32_t *approx_match) +{ + *approx_match = 0; + + /* ensure the approx flag is NOT set by anyone yet */ + BtreeNodeProxy *node = get_node_from_page(page); + if (node->get_count() == 0) + return (-1); + + int cmp; + int slot = node->find_child(context, key, 0, &cmp); + + /* successfull match */ + if (cmp == 0 && (flags == 0 || flags & HAM_FIND_EXACT_MATCH)) + return (slot); + + /* approx. 
matching: smaller key is required */ + if (flags & HAM_FIND_LT_MATCH) { + if (cmp == 0 && (flags & HAM_FIND_GT_MATCH)) { + *approx_match = BtreeKey::kLower; + return (slot + 1); + } + + if (slot < 0 && (flags & HAM_FIND_GT_MATCH)) { + *approx_match = BtreeKey::kGreater; + return (0); + } + *approx_match = BtreeKey::kLower; + if (cmp <= 0) + return (slot - 1); + return (slot); + } + + /* approx. matching: greater key is required */ + if (flags & HAM_FIND_GT_MATCH) { + *approx_match = BtreeKey::kGreater; + return (slot + 1); + } + + return (cmp ? -1 : slot); +} + +// +// visitor object for estimating / counting the number of keys +/// +class CalcKeysVisitor : public BtreeVisitor { + public: + CalcKeysVisitor(LocalDatabase *db, bool distinct) + : m_db(db), m_distinct(distinct), m_count(0) { + } + + virtual bool is_read_only() const { + return (true); + } + + virtual void operator()(Context *context, BtreeNodeProxy *node) { + size_t node_count = node->get_count(); + + if (m_distinct + || (m_db->get_flags() & HAM_ENABLE_DUPLICATE_KEYS) == 0) { + m_count += node_count; + return; + } + + for (size_t i = 0; i < node_count; i++) + m_count += node->get_record_count(context, i); + } + + uint64_t get_result() const { + return (m_count); + } + + private: + LocalDatabase *m_db; + bool m_distinct; + uint64_t m_count; +}; + +uint64_t +BtreeIndex::count(Context *context, bool distinct) +{ + CalcKeysVisitor visitor(m_db, distinct); + visit_nodes(context, visitor, false); + return (visitor.get_result()); +} + +// +// visitor object to free all allocated blobs +/// +class FreeBlobsVisitor : public BtreeVisitor { + public: + virtual void operator()(Context *context, BtreeNodeProxy *node) { + node->remove_all_entries(context); + } + + virtual bool is_read_only() const { + return (false); + } +}; + +void +BtreeIndex::release(Context *context) +{ + FreeBlobsVisitor visitor; + visit_nodes(context, visitor, true); +} + +} // namespace hamsterdb diff --git 
a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_index.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_index.h new file mode 100644 index 0000000000..f325f7915f --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_index.h @@ -0,0 +1,455 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_INDEX_H +#define HAM_BTREE_INDEX_H + +#include "0root/root.h" + +#include <algorithm> + +// Always verify that a file of level N does not include headers > N! +#include "1globals/globals.h" +#include "1base/abi.h" +#include "1base/dynamic_array.h" +#include "3btree/btree_cursor.h" +#include "3btree/btree_stats.h" +#include "3btree/btree_node.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +struct Context; + +#include "1base/packstart.h" + +// +// The persistent btree index descriptor. This structure manages the +// persistent btree metadata. +// +HAM_PACK_0 class HAM_PACK_1 PBtreeHeader +{ + public: + PBtreeHeader() { + memset(this, 0, sizeof(*this)); + } + + // Returns the database name + uint16_t get_dbname() const { + return (m_dbname); + } + + // Sets the database name + void set_dbname(uint16_t name) { + m_dbname = name; + } + + // Returns the btree's max. key_size + size_t get_key_size() const { + return (m_key_size); + } + + // Sets the btree's max. 
key_size + void set_key_size(uint16_t key_size) { + m_key_size = key_size; + } + + // Returns the record size (or 0 if none was specified) + uint32_t get_record_size() const { + return (m_rec_size); + } + + // Sets the record size + void set_rec_size(uint32_t rec_size) { + m_rec_size = rec_size; + } + + // Returns the btree's key type + uint16_t get_key_type() const { + return (m_key_type); + } + + // Sets the btree's key type + void set_key_type(uint16_t key_type) { + m_key_type = key_type; + } + + // Returns the address of the btree's root page. + uint64_t get_root_address() const { + return (m_root_address); + } + + // Sets the address of the btree's root page. + void set_root_address(uint64_t root_address) { + m_root_address = root_address; + } + + // Returns the btree's flags + uint32_t get_flags() const { + return (m_flags); + } + + // Sets the btree's flags + void set_flags(uint32_t flags) { + m_flags = flags; + } + + // PRO: Returns the record compression + uint8_t get_record_compression() const { + return (m_compression >> 4); + } + + // PRO: Sets the record compression + void set_record_compression(int algorithm) { + m_compression |= algorithm << 4; + } + + // PRO: Returns the key compression + uint8_t get_key_compression() const { + return (m_compression & 0xf); + } + + // PRO: Sets the key compression + void set_key_compression(int algorithm) { + m_compression |= algorithm & 0xf; + } + + private: + // address of the root-page + uint64_t m_root_address; + + // flags for this database + uint32_t m_flags; + + // The name of the database + uint16_t m_dbname; + + // key size used in the pages + uint16_t m_key_size; + + // key type + uint16_t m_key_type; + + // PRO: for storing key and record compression algorithm */ + uint8_t m_compression; + + // reserved + uint8_t m_reserved1; + + // the record size + uint32_t m_rec_size; + +} HAM_PACK_2; + +#include "1base/packstop.h" + +struct Context; +class LocalDatabase; +class BtreeNodeProxy; +struct PDupeEntry; 
+struct BtreeVisitor; + +// +// Abstract base class, overwritten by a templated version +// +class BtreeIndexTraits +{ + public: + // virtual destructor + virtual ~BtreeIndexTraits() { } + + // Compares two keys + // Returns -1, 0, +1 or higher positive values are the result of a + // successful key comparison (0 if both keys match, -1 when + // LHS < RHS key, +1 when LHS > RHS key). + virtual int compare_keys(LocalDatabase *db, ham_key_t *lhs, + ham_key_t *rhs) const = 0; + + // Returns the class name (for testing) + virtual std::string test_get_classname() const = 0; + + // Implementation of get_node_from_page() + virtual BtreeNodeProxy *get_node_from_page_impl(Page *page) const = 0; +}; + +// +// The Btree. Derived by BtreeIndexImpl, which uses template policies to +// define the btree node layout. +// +class BtreeIndex +{ + public: + enum { + // for get_node_from_page(): Page is a leaf + kLeafPage = 1, + + // for get_node_from_page(): Page is an internal node + kInternalPage = 2 + }; + + // Constructor; creates and initializes a new btree + BtreeIndex(LocalDatabase *db, PBtreeHeader *btree_header, + uint32_t flags, uint32_t key_type, uint32_t key_size); + + ~BtreeIndex() { + delete m_leaf_traits; + m_leaf_traits = 0; + delete m_internal_traits; + m_internal_traits = 0; + } + + // Returns the database pointer + LocalDatabase *get_db() { + return (m_db); + } + + // Returns the database pointer + LocalDatabase *get_db() const { + return (m_db); + } + + // Returns the internal key size + size_t get_key_size() const { + return (m_key_size); + } + + // Returns the record size + size_t get_record_size() const { + return (m_rec_size); + } + + // Returns the internal key type + uint16_t get_key_type() const { + return (m_key_type); + } + + // Returns the address of the root page + uint64_t get_root_address() const { + return (m_root_address); + } + + // Returns the btree flags + uint32_t get_flags() const { + return (m_flags); + } + + // Creates and initializes the 
btree + // + // This function is called after the ham_db_t structure was allocated + // and the file was opened + void create(Context *context, uint16_t key_type, uint32_t key_size, + uint32_t rec_size); + + // Opens and initializes the btree + // + // This function is called after the ham_db_t structure was allocated + // and the file was opened + void open(); + + // Sets the record compression algorithm + void set_record_compression(Context *context, int algo); + + // Returns the record compression algorithm + int get_record_compression(); + + // Sets the key compression algorithm + void set_key_compression(Context *context, int algo); + + // Returns the key compression algorithm + int get_key_compression(); + + // Lookup a key in the index (ham_db_find) + ham_status_t find(Context *context, Cursor *cursor, ham_key_t *key, + ByteArray *key_arena, ham_record_t *record, + ByteArray *record_arena, uint32_t flags); + + // Inserts (or updates) a key/record in the index (ham_db_insert) + ham_status_t insert(Context *context, Cursor *cursor, ham_key_t *key, + ham_record_t *record, uint32_t flags); + + // Erases a key/record from the index (ham_db_erase). + // If |duplicate_index| is 0 then all duplicates are erased, otherwise only + // the specified duplicate is erased. + ham_status_t erase(Context *context, Cursor *cursor, ham_key_t *key, + int duplicate_index, uint32_t flags); + + // Iterates over the whole index and calls |visitor| on every node + void visit_nodes(Context *context, BtreeVisitor &visitor, + bool visit_internal_nodes); + + // Checks the integrity of the btree (ham_db_check_integrity) + void check_integrity(Context *context, uint32_t flags); + + // Counts the keys in the btree + uint64_t count(Context *context, bool distinct); + + // Erases all records, overflow areas, extended keys etc from the index; + // used to avoid memory leaks when closing in-memory Databases and to + // clean up when deleting on-disk Databases. 
    void release(Context *context);

    // Compares two keys by forwarding them, together with the database
    // object, to the leaf traits.
    // The result is 0 if both keys match, -1 when LHS < RHS key, and +1
    // (or a higher positive value) when LHS > RHS key.
    int compare_keys(ham_key_t *lhs, ham_key_t *rhs) const {
      return (m_leaf_traits->compare_keys(m_db, lhs, rhs));
    }

    // Returns a BtreeNodeProxy for a Page.
    // If the page already carries a proxy, that one is returned. Otherwise
    // a new proxy is created by the leaf traits or the internal traits
    // (depending on PBtreeNode::is_leaf()), stored in the page and returned.
    BtreeNodeProxy *get_node_from_page(Page *page) {
      if (page->get_node_proxy())
        return (page->get_node_proxy());

      BtreeNodeProxy *proxy;
      PBtreeNode *node = PBtreeNode::from_page(page);
      if (node->is_leaf())
        proxy = get_leaf_node_from_page_impl(page);
      else
        proxy = get_internal_node_from_page_impl(page);

      // remember the proxy in the page so the next call takes the early exit
      page->set_node_proxy(proxy);
      return (proxy);
    }

    // Returns the usage metrics: copies the split/merge counters of this
    // class and the extended-key, extended-duptable and key-compression
    // counters of Globals into |metrics|.
    // NOTE(review): ms_btree_smo_shift is declared below but not copied here.
    static void fill_metrics(ham_env_metrics_t *metrics) {
      metrics->btree_smo_split = ms_btree_smo_split;
      metrics->btree_smo_merge = ms_btree_smo_merge;
      metrics->extended_keys = Globals::ms_extended_keys;
      metrics->extended_duptables = Globals::ms_extended_duptables;
      metrics->key_bytes_before_compression
              = Globals::ms_bytes_before_compression;
      metrics->key_bytes_after_compression
              = Globals::ms_bytes_after_compression;
    }

    // Returns the btree usage statistics (pointer to the embedded member)
    BtreeStatistics *get_statistics() {
      return (&m_statistics);
    }

    // Returns the class name (for testing); forwarded to the leaf traits
    std::string test_get_classname() const {
      return (m_leaf_traits->test_get_classname());
    }

  private:
    // the btree actions, the cursor and the test fixtures need access to
    // the private helpers and members below
    friend class BtreeUpdateAction;
    friend class BtreeCheckAction;
    friend class BtreeEnumAction;
    friend class BtreeEraseAction;
    friend class BtreeFindAction;
    friend class BtreeInsertAction;
    friend class BtreeCursor;
    friend struct MiscFixture;
    friend struct BtreeKeyFixture;
    friend struct BtreeCursorFixture;
    friend struct DbFixture;
    friend struct DuplicateFixture;

    // Implementation of get_node_from_page() (for leaf nodes);
    // forwarded to the leaf traits
    BtreeNodeProxy *get_leaf_node_from_page_impl(Page *page) const {
      return (m_leaf_traits->get_node_from_page_impl(page));
    }

    // Implementation of get_node_from_page() (for internal nodes);
    // forwarded to the internal traits
    BtreeNodeProxy *get_internal_node_from_page_impl(Page *page) const {
      return (m_internal_traits->get_node_from_page_impl(page));
    }

    // Sets the address of the root page and immediately flushes the
    // descriptor
    void set_root_address(Context *context, uint64_t address) {
      m_root_address = address;
      flush_descriptor(context);
    }

    // Flushes the PBtreeHeader to the Environment's header page
    void flush_descriptor(Context *context);

    // Searches |parent| page for key |key| and returns the child page.
    //
    // |page_manager_flags| are forwarded to PageManager::fetch.
    //
    // if |idxptr| is a valid pointer then it will return the anchor index
    // of the loaded page.
    Page *find_child(Context *context, Page *parent, const ham_key_t *key,
                    uint32_t page_manager_flags, int *idxptr);

    // Searches a leaf node for a key.
    //
    // !!!
    // only works with leaf nodes!!
    //
    // Returns the index of the key, or -1 if the key was not found, or
    // another negative status code value when an unexpected error occurred.
    int find_leaf(Context *context, Page *page, ham_key_t *key, uint32_t flags,
                    uint32_t *approx_match);

    // pointer to the database object
    LocalDatabase *m_db;

    // the Traits class wrapping the template parameters (factory for
    // leaf nodes)
    BtreeIndexTraits *m_leaf_traits;

    // the Traits class wrapping the template parameters (factory for
    // internal nodes)
    BtreeIndexTraits *m_internal_traits;

    // the key_size of this btree index
    uint16_t m_key_size;

    // the key_type of this btree index
    uint16_t m_key_type;

    // the record size (or 0 if none was specified)
    uint32_t m_rec_size;

    // the PBtreeHeader of this index in the Environment's header page
    // (held as a pointer)
    PBtreeHeader *m_btree_header;

    // the persistent flags of this btree index
    uint32_t m_flags;

    // address of the root-page
    uint64_t m_root_address;

    // the btree statistics
    BtreeStatistics m_statistics;

    // usage metrics - number of page splits
    static uint64_t ms_btree_smo_split;

    // usage metrics - number of page merges
    static uint64_t ms_btree_smo_merge;

    // usage metrics - number of page shifts
    static uint64_t ms_btree_smo_shift;
};

} // namespace hamsterdb

#endif /* HAM_BTREE_INDEX_H */
diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_index_factory.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_index_factory.h
new file mode 100644
index 0000000000..49d1ea8189
--- /dev/null
+++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_index_factory.h
@@ -0,0 +1,445 @@
/*
 * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de).
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_INDEX_FACTORY_H +#define HAM_BTREE_INDEX_FACTORY_H + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "3btree/btree_index.h" +#include "3btree/btree_impl_default.h" +#include "3btree/btree_impl_pax.h" +#include "3btree/btree_keys_pod.h" +#include "3btree/btree_keys_binary.h" +#include "3btree/btree_keys_varlen.h" +#include "3btree/btree_records_default.h" +#include "3btree/btree_records_inline.h" +#include "3btree/btree_records_internal.h" +#include "3btree/btree_records_duplicate.h" +#include "3btree/btree_node_proxy.h" +#include "4db/db_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +// +// A specialied Traits class using template parameters +// +template<class NodeLayout, class Comparator> +class BtreeIndexTraitsImpl : public BtreeIndexTraits +{ + public: + // Compares two keys + // Returns -1, 0, +1 or higher positive values are the result of a + // successful key comparison (0 if both keys match, -1 when + // LHS < RHS key, +1 when LHS > RHS key). 
+ virtual int compare_keys(LocalDatabase *db, ham_key_t *lhs, + ham_key_t *rhs) const { + Comparator cmp(db); + return (cmp(lhs->data, lhs->size, rhs->data, rhs->size)); + } + + // Returns the class name (for testing) + virtual std::string test_get_classname() const { + return (get_classname(*this)); + } + + // Implementation of get_node_from_page() + virtual BtreeNodeProxy *get_node_from_page_impl(Page *page) const { + return (new BtreeNodeProxyImpl<NodeLayout, Comparator>(page)); + } +}; + +// +// A BtreeIndexFactory creates BtreeIndexProxy objects depending on the +// Database configuration +// +struct BtreeIndexFactory +{ + static BtreeIndexTraits *create(LocalDatabase *db, uint32_t flags, + uint16_t key_type, uint16_t key_size, bool is_leaf) { + bool inline_records = (is_leaf && (flags & HAM_FORCE_RECORDS_INLINE)); + bool fixed_keys = (key_size != HAM_KEY_SIZE_UNLIMITED); + bool use_duplicates = (flags & HAM_ENABLE_DUPLICATES) != 0; + + switch (key_type) { + // 8bit unsigned integer + case HAM_TYPE_UINT8: + if (use_duplicates) { + if (!is_leaf) + return (new BtreeIndexTraitsImpl< + PaxNodeImpl<PaxLayout::PodKeyList<uint8_t>, + PaxLayout::InternalRecordList>, + NumericCompare<uint8_t> >()); + if (inline_records) + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<PaxLayout::PodKeyList<uint8_t>, + DefLayout::DuplicateInlineRecordList>, + NumericCompare<uint8_t> >()); + else + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<PaxLayout::PodKeyList<uint8_t>, + DefLayout::DuplicateDefaultRecordList>, + NumericCompare<uint8_t> >()); + } + else { + if (!is_leaf) + return (new BtreeIndexTraitsImpl< + PaxNodeImpl<PaxLayout::PodKeyList<uint8_t>, + PaxLayout::InternalRecordList>, + NumericCompare<uint8_t> >()); + if (inline_records) + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::PodKeyList<uint8_t>, + PaxLayout::InlineRecordList>, + NumericCompare<uint8_t> >()); + else + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::PodKeyList<uint8_t>, + 
PaxLayout::DefaultRecordList>, + NumericCompare<uint8_t> >()); + } + // 16bit unsigned integer + case HAM_TYPE_UINT16: + if (use_duplicates) { + if (!is_leaf) + return (new BtreeIndexTraitsImpl< + PaxNodeImpl<PaxLayout::PodKeyList<uint16_t>, + PaxLayout::InternalRecordList>, + NumericCompare<uint16_t> >()); + if (inline_records) + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<PaxLayout::PodKeyList<uint16_t>, + DefLayout::DuplicateInlineRecordList>, + NumericCompare<uint16_t> >()); + else + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<PaxLayout::PodKeyList<uint16_t>, + DefLayout::DuplicateDefaultRecordList>, + NumericCompare<uint16_t> >()); + } + else { + if (!is_leaf) + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::PodKeyList<uint16_t>, + PaxLayout::InternalRecordList>, + NumericCompare<uint16_t> >()); + if (inline_records) + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::PodKeyList<uint16_t>, + PaxLayout::InlineRecordList>, + NumericCompare<uint16_t> >()); + else + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::PodKeyList<uint16_t>, + PaxLayout::DefaultRecordList>, + NumericCompare<uint16_t> >()); + } + // 32bit unsigned integer + case HAM_TYPE_UINT32: + if (use_duplicates) { + if (!is_leaf) + return (new BtreeIndexTraitsImpl< + PaxNodeImpl<PaxLayout::PodKeyList<uint32_t>, + PaxLayout::InternalRecordList>, + NumericCompare<uint32_t> >()); + if (inline_records) + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<PaxLayout::PodKeyList<uint32_t>, + DefLayout::DuplicateInlineRecordList>, + NumericCompare<uint32_t> >()); + else + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<PaxLayout::PodKeyList<uint32_t>, + DefLayout::DuplicateDefaultRecordList>, + NumericCompare<uint32_t> >()); + } + else { + if (!is_leaf) + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::PodKeyList<uint32_t>, + PaxLayout::InternalRecordList>, + NumericCompare<uint32_t> >()); + if (inline_records) + return (new BtreeIndexTraitsImpl 
+ <PaxNodeImpl<PaxLayout::PodKeyList<uint32_t>, + PaxLayout::InlineRecordList>, + NumericCompare<uint32_t> >()); + else + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::PodKeyList<uint32_t>, + PaxLayout::DefaultRecordList>, + NumericCompare<uint32_t> >()); + } + // 64bit unsigned integer + case HAM_TYPE_UINT64: + if (use_duplicates) { + if (!is_leaf) + return (new BtreeIndexTraitsImpl< + PaxNodeImpl<PaxLayout::PodKeyList<uint64_t>, + PaxLayout::InternalRecordList>, + NumericCompare<uint64_t> >()); + if (inline_records) + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<PaxLayout::PodKeyList<uint64_t>, + DefLayout::DuplicateInlineRecordList>, + NumericCompare<uint64_t> >()); + else + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<PaxLayout::PodKeyList<uint64_t>, + DefLayout::DuplicateDefaultRecordList>, + NumericCompare<uint64_t> >()); + } + else { + if (!is_leaf) + return (new BtreeIndexTraitsImpl< + PaxNodeImpl<PaxLayout::PodKeyList<uint64_t>, + PaxLayout::InternalRecordList>, + NumericCompare<uint64_t> >()); + if (inline_records) + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::PodKeyList<uint64_t>, + PaxLayout::InlineRecordList>, + NumericCompare<uint64_t> >()); + else + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::PodKeyList<uint64_t>, + PaxLayout::DefaultRecordList>, + NumericCompare<uint64_t> >()); + } + // 32bit float + case HAM_TYPE_REAL32: + if (use_duplicates) { + if (!is_leaf) + return (new BtreeIndexTraitsImpl< + PaxNodeImpl<PaxLayout::PodKeyList<float>, + PaxLayout::InternalRecordList>, + NumericCompare<float> >()); + if (inline_records) + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<PaxLayout::PodKeyList<float>, + DefLayout::DuplicateInlineRecordList>, + NumericCompare<float> >()); + else + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<PaxLayout::PodKeyList<float>, + DefLayout::DuplicateDefaultRecordList>, + NumericCompare<float> >()); + } + else { + if (!is_leaf) + return (new 
BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::PodKeyList<float>, + PaxLayout::InternalRecordList>, + NumericCompare<float> >()); + if (inline_records) + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::PodKeyList<float>, + PaxLayout::InlineRecordList>, + NumericCompare<float> >()); + else + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::PodKeyList<float>, + PaxLayout::DefaultRecordList>, + NumericCompare<float> >()); + } + // 64bit double + case HAM_TYPE_REAL64: + if (use_duplicates) { + if (!is_leaf) + return (new BtreeIndexTraitsImpl< + PaxNodeImpl<PaxLayout::PodKeyList<double>, + PaxLayout::InternalRecordList>, + NumericCompare<double> >()); + if (inline_records) + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<PaxLayout::PodKeyList<double>, + DefLayout::DuplicateInlineRecordList>, + NumericCompare<double> >()); + else + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<PaxLayout::PodKeyList<double>, + DefLayout::DuplicateDefaultRecordList>, + NumericCompare<double> >()); + } + else { + if (!is_leaf) + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::PodKeyList<double>, + PaxLayout::InternalRecordList>, + NumericCompare<double> >()); + if (inline_records) + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::PodKeyList<double>, + PaxLayout::InlineRecordList>, + NumericCompare<double> >()); + else + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::PodKeyList<double>, + PaxLayout::DefaultRecordList>, + NumericCompare<double> >()); + } + // Callback function provided by user? 
+ case HAM_TYPE_CUSTOM: + // Fixed keys, no duplicates + if (fixed_keys && !use_duplicates) { + if (!is_leaf) + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::BinaryKeyList, + PaxLayout::InternalRecordList>, + CallbackCompare>()); + if (inline_records) + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::BinaryKeyList, + PaxLayout::InlineRecordList>, + CallbackCompare>()); + else + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::BinaryKeyList, + PaxLayout::DefaultRecordList>, + CallbackCompare>()); + } + // Fixed keys WITH duplicates + if (fixed_keys && use_duplicates) { + if (!is_leaf) + return (new BtreeIndexTraitsImpl< + PaxNodeImpl<PaxLayout::BinaryKeyList, + PaxLayout::InternalRecordList>, + CallbackCompare >()); + if (inline_records) + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<PaxLayout::BinaryKeyList, + DefLayout::DuplicateInlineRecordList>, + CallbackCompare >()); + else + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<PaxLayout::BinaryKeyList, + DefLayout::DuplicateDefaultRecordList>, + CallbackCompare >()); + } + // Variable keys with or without duplicates + if (!is_leaf) + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<DefLayout::VariableLengthKeyList, + PaxLayout::InternalRecordList>, + CallbackCompare >()); + if (inline_records && !use_duplicates) + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<DefLayout::VariableLengthKeyList, + PaxLayout::InlineRecordList>, + CallbackCompare >()); + if (inline_records && use_duplicates) + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<DefLayout::VariableLengthKeyList, + DefLayout::DuplicateInlineRecordList>, + CallbackCompare >()); + if (!inline_records && !use_duplicates) + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<DefLayout::VariableLengthKeyList, + PaxLayout::DefaultRecordList>, + CallbackCompare >()); + if (!inline_records && use_duplicates) + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<DefLayout::VariableLengthKeyList, + 
DefLayout::DuplicateDefaultRecordList>, + CallbackCompare >()); + ham_assert(!"shouldn't be here"); + // BINARY is the default: + case HAM_TYPE_BINARY: + // Fixed keys, no duplicates + if (fixed_keys && !use_duplicates) { + if (!is_leaf) + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::BinaryKeyList, + PaxLayout::InternalRecordList>, + FixedSizeCompare>()); + if (inline_records) + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::BinaryKeyList, + PaxLayout::InlineRecordList>, + FixedSizeCompare>()); + else + return (new BtreeIndexTraitsImpl + <PaxNodeImpl<PaxLayout::BinaryKeyList, + PaxLayout::DefaultRecordList>, + FixedSizeCompare>()); + } + // fixed keys with duplicates + if (fixed_keys && use_duplicates) { + if (!is_leaf) + return (new BtreeIndexTraitsImpl< + PaxNodeImpl<PaxLayout::BinaryKeyList, + PaxLayout::InternalRecordList>, + FixedSizeCompare >()); + if (inline_records && use_duplicates) + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<PaxLayout::BinaryKeyList, + DefLayout::DuplicateInlineRecordList>, + FixedSizeCompare >()); + if (!inline_records && use_duplicates) + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<PaxLayout::BinaryKeyList, + DefLayout::DuplicateDefaultRecordList>, + FixedSizeCompare >()); + } + // variable length keys, with and without duplicates + if (!is_leaf) + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<DefLayout::VariableLengthKeyList, + PaxLayout::InternalRecordList>, + VariableSizeCompare >()); + if (inline_records && !use_duplicates) + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<DefLayout::VariableLengthKeyList, + PaxLayout::InlineRecordList>, + VariableSizeCompare >()); + if (inline_records && use_duplicates) + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<DefLayout::VariableLengthKeyList, + DefLayout::DuplicateInlineRecordList>, + VariableSizeCompare >()); + if (!inline_records && !use_duplicates) + return (new BtreeIndexTraitsImpl< + 
DefaultNodeImpl<DefLayout::VariableLengthKeyList, + PaxLayout::DefaultRecordList>, + VariableSizeCompare >()); + if (!inline_records && use_duplicates) + return (new BtreeIndexTraitsImpl< + DefaultNodeImpl<DefLayout::VariableLengthKeyList, + DefLayout::DuplicateDefaultRecordList>, + VariableSizeCompare >()); + ham_assert(!"shouldn't be here"); + default: + break; + } + + ham_assert(!"shouldn't be here"); + return (0); + } +}; + +} // namespace hamsterdb + +#endif /* HAM_BTREE_INDEX_FACTORY_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_insert.cc b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_insert.cc new file mode 100644 index 0000000000..7dac8365d7 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_insert.cc @@ -0,0 +1,214 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * btree inserting + */ + +#include "0root/root.h" + +#include <string.h> +#include <algorithm> + +// Always verify that a file of level N does not include headers > N! 
+#include "1base/error.h" +#include "1base/dynamic_array.h" +#include "2page/page.h" +#include "3blob_manager/blob_manager.h" +#include "3page_manager/page_manager.h" +#include "3btree/btree_index.h" +#include "3btree/btree_stats.h" +#include "3btree/btree_node_proxy.h" +#include "3btree/btree_cursor.h" +#include "3btree/btree_update.h" +#include "4cursor/cursor.h" +#include "4db/db.h" +#include "4env/env.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +using namespace std; + +namespace hamsterdb { + +class BtreeInsertAction : public BtreeUpdateAction +{ + public: + BtreeInsertAction(BtreeIndex *btree, Context *context, Cursor *cursor, + ham_key_t *key, ham_record_t *record, uint32_t flags) + : BtreeUpdateAction(btree, context, cursor + ? cursor->get_btree_cursor() + : 0, 0), + m_key(key), m_record(record), m_flags(flags) { + if (m_cursor) + m_duplicate_index = m_cursor->get_duplicate_index(); + } + + // This is the entry point for the actual insert operation + ham_status_t run() { + BtreeStatistics *stats = m_btree->get_statistics(); + + m_hints = stats->get_insert_hints(m_flags); + + ham_assert((m_hints.flags & (HAM_DUPLICATE_INSERT_BEFORE + | HAM_DUPLICATE_INSERT_AFTER + | HAM_DUPLICATE_INSERT_FIRST + | HAM_DUPLICATE_INSERT_LAST)) + ? (m_hints.flags & HAM_DUPLICATE) + : 1); + + /* + * append the key? 
append_or_prepend_key() will try to append or + * prepend the key; if this fails because the key is NOT the largest + * (or smallest) key in the database or because the current page is + * already full, it will remove the HINT_APPEND (or HINT_PREPEND) + * flag and call insert() + */ + ham_status_t st; + if (m_hints.leaf_page_addr + && (m_hints.flags & HAM_HINT_APPEND + || m_hints.flags & HAM_HINT_PREPEND)) + st = append_or_prepend_key(); + else + st = insert(); + + if (st == HAM_LIMITS_REACHED) + st = insert(); + + if (st) + stats->insert_failed(); + else { + if (m_hints.processed_leaf_page) + stats->insert_succeeded(m_hints.processed_leaf_page, + m_hints.processed_slot); + } + + return (st); + } + + private: + // Appends a key at the "end" of the btree, or prepends it at the + // "beginning" + ham_status_t append_or_prepend_key() { + Page *page; + LocalDatabase *db = m_btree->get_db(); + LocalEnvironment *env = db->lenv(); + bool force_append = false; + bool force_prepend = false; + + /* + * see if we get this btree leaf; if not, revert to regular scan + * + * As this is a speed-improvement hint re-using recent material, the page + * should still sit in the cache, or we're using old info, which should + * be discarded. 
+ */ + page = env->page_manager()->fetch(m_context, m_hints.leaf_page_addr, + PageManager::kOnlyFromCache); + /* if the page is not in cache: do a regular insert */ + if (!page) + return (insert()); + + BtreeNodeProxy *node = m_btree->get_node_from_page(page); + ham_assert(node->is_leaf()); + + /* + * if the page is already full OR this page is not the right-most page + * when we APPEND or the left-most node when we PREPEND + * OR the new key is not the highest key: perform a normal insert + */ + if ((m_hints.flags & HAM_HINT_APPEND && node->get_right() != 0) + || (m_hints.flags & HAM_HINT_PREPEND && node->get_left() != 0) + || node->requires_split(m_context, m_key)) + return (insert()); + + /* + * if the page is not empty: check if we append the key at the end/start + * (depending on the flags), or if it's actually inserted in the middle. + */ + if (node->get_count() != 0) { + if (m_hints.flags & HAM_HINT_APPEND) { + int cmp_hi = node->compare(m_context, m_key, node->get_count() - 1); + /* key is at the end */ + if (cmp_hi > 0) { + ham_assert(node->get_right() == 0); + force_append = true; + } + } + + if (m_hints.flags & HAM_HINT_PREPEND) { + int cmp_lo = node->compare(m_context, m_key, 0); + /* key is at the start of page */ + if (cmp_lo < 0) { + ham_assert(node->get_left() == 0); + force_prepend = true; + } + } + } + + /* OK - we're really appending/prepending the new key. 
*/ + if (force_append || force_prepend) + return (insert_in_page(page, m_key, m_record, m_hints, + force_prepend, force_append)); + + /* otherwise reset the hints because they are no longer valid */ + m_hints.flags &= ~HAM_HINT_APPEND; + m_hints.flags &= ~HAM_HINT_PREPEND; + return (insert()); + } + + ham_status_t insert() { + // traverse the tree till a leaf is reached + Page *parent; + Page *page = traverse_tree(m_key, m_hints, &parent); + + // We've reached the leaf; it's still possible that we have to + // split the page, therefore this case has to be handled + ham_status_t st = insert_in_page(page, m_key, m_record, m_hints); + if (st == HAM_LIMITS_REACHED) { + page = split_page(page, parent, m_key, m_hints); + return (insert_in_page(page, m_key, m_record, m_hints)); + } + return (st); + } + + // the key that is inserted + ham_key_t *m_key; + + // the record that is inserted + ham_record_t *m_record; + + // flags of ham_db_insert() + uint32_t m_flags; + + // statistical hints for this operation + BtreeStatistics::InsertHints m_hints; +}; + +ham_status_t +BtreeIndex::insert(Context *context, Cursor *cursor, ham_key_t *key, + ham_record_t *record, uint32_t flags) +{ + context->db = get_db(); + + BtreeInsertAction bia(this, context, cursor, key, record, flags); + return (bia.run()); +} + +} // namespace hamsterdb + diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_keys_base.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_keys_base.h new file mode 100644 index 0000000000..da5804ad04 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_keys_base.h @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Base class for KeyLists + * + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_KEYS_BASE_H +#define HAM_BTREE_KEYS_BASE_H + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +struct BaseKeyList +{ + enum { + // This KeyList cannot reduce its capacity in order to release storage + kCanReduceCapacity = 0, + + // This KeyList uses binary search combined with linear search + kBinaryLinear, + + // This KeyList has a custom search implementation + kCustomSearch, + + // This KeyList has a custom search implementation for exact matches + // *only* + kCustomExactImplementation, + + // This KeyList uses binary search (this is the default) + kBinarySearch, + + // Specifies the search implementation: + kSearchImplementation = kBinarySearch, + + // This KeyList does NOT have a custom insert implementation + kCustomInsert = 0, + }; + + BaseKeyList() + : m_range_size(0) { + } + + // Erases the extended part of a key; nothing to do here + void erase_extended_key(Context *context, int slot) const { + } + + // Checks the integrity of this node. Throws an exception if there is a + // violation. 
+ void check_integrity(Context *context, size_t node_count) const { + } + + // Rearranges the list + void vacuumize(size_t node_count, bool force) const { + } + + // Finds a key + template<typename Cmp> + int find(Context *, size_t node_count, const ham_key_t *key, Cmp &comparator, + int *pcmp) { + ham_assert(!"shouldn't be here"); + return (0); + } + + // Returns the threshold when switching from binary search to + // linear search. Disabled by default + size_t get_linear_search_threshold() const { + return ((size_t)-1); + } + + // Performs a linear search in a given range between |start| and + // |start + length|. Disabled by default. + template<typename Cmp> + int linear_search(size_t start, size_t length, const ham_key_t *hkey, + Cmp &comparator, int *pcmp) { + ham_assert(!"shouldn't be here"); + throw Exception(HAM_INTERNAL_ERROR); + } + + // Fills the btree_metrics structure + void fill_metrics(btree_metrics_t *metrics, size_t node_count) { + BtreeStatistics::update_min_max_avg(&metrics->keylist_ranges, m_range_size); + } + + // The size of the range (in bytes) + size_t m_range_size; +}; + +} // namespace hamsterdb + +#endif /* HAM_BTREE_KEYS_BASE_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_keys_binary.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_keys_binary.h new file mode 100644 index 0000000000..faea959ec5 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_keys_binary.h @@ -0,0 +1,273 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Fixed length KeyList for binary data + * + * This KeyList stores binary keys of fixed length size. It is implemented + * as a plain C array of type uint8_t[]. It has fast random access, i.e. + * key #N starts at data[N * keysize]. + * + * This KeyList cannot be resized. + * + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_KEYS_BINARY_H +#define HAM_BTREE_KEYS_BINARY_H + +#include "0root/root.h" + +#include <sstream> +#include <iostream> + +// Always verify that a file of level N does not include headers > N! +#include "1globals/globals.h" +#include "1base/dynamic_array.h" +#include "2page/page.h" +#include "3btree/btree_node.h" +#include "3blob_manager/blob_manager.h" +#include "3btree/btree_keys_base.h" +#include "4env/env_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +// +// The template classes in this file are wrapped in a separate namespace +// to avoid naming clashes with btree_impl_default.h +// +namespace PaxLayout { + +// +// Same as the PodKeyList, but for binary arrays of fixed length +// +class BinaryKeyList : public BaseKeyList +{ + public: + enum { + // A flag whether this KeyList has sequential data + kHasSequentialData = 1, + + // A flag whether this KeyList supports the scan() call + kSupportsBlockScans = 1, + + // This KeyList uses binary search in combination with linear search + kSearchImplementation = kBinaryLinear, + }; + + // Constructor + BinaryKeyList(LocalDatabase *db) + : m_data(0) { + m_key_size = db->config().key_size; + ham_assert(m_key_size != 0); + } + + // Creates a new KeyList starting at |data|, total size is + // |range_size| (in bytes) + void create(uint8_t *data, size_t range_size) { + m_data = data; + m_range_size = range_size; + } + + // Opens an existing KeyList starting at |data| + void open(uint8_t *data, size_t 
range_size, size_t node_count) { + m_data = data; + m_range_size = range_size; + } + + // Calculates the required size for this range + size_t get_required_range_size(size_t node_count) const { + return (node_count * m_key_size); + } + + // Returns the actual key size including overhead + size_t get_full_key_size(const ham_key_t *key = 0) const { + return (m_key_size); + } + + // Copies a key into |dest| + void get_key(Context *context, int slot, ByteArray *arena, ham_key_t *dest, + bool deep_copy = true) const { + dest->size = (uint16_t)m_key_size; + if (likely(deep_copy == false)) { + dest->data = &m_data[slot * m_key_size]; + return; + } + + // allocate memory (if required) + if (!(dest->flags & HAM_KEY_USER_ALLOC)) { + arena->resize(dest->size); + dest->data = arena->get_ptr(); + } + + memcpy(dest->data, &m_data[slot * m_key_size], m_key_size); + } + + // Returns the threshold when switching from binary search to + // linear search + size_t get_linear_search_threshold() const { + if (m_key_size > 32) + return (-1); // disable linear search for large keys + return (128 / m_key_size); + } + + // Performs a linear search in a given range between |start| and + // |start + length| + template<typename Cmp> + int linear_search(size_t start, size_t length, const ham_key_t *key, + Cmp &comparator, int *pcmp) { + uint8_t *begin = &m_data[start * m_key_size]; + uint8_t *end = &m_data[(start + length) * m_key_size]; + uint8_t *current = begin; + + int c = start; + + while (current < end) { + /* compare it against the key */ + int cmp = comparator(key->data, key->size, current, m_key_size); + + /* found it, or moved past the key? */ + if (cmp <= 0) { + if (cmp < 0) { + if (c == 0) + *pcmp = -1; // key is < #m_data[0] + else + *pcmp = +1; // key is > #m_data[c - 1]! 
+ return (c - 1); + } + *pcmp = 0; + return (c); + } + + current += m_key_size; + c++; + } + + /* the new key is > the last key in the page */ + *pcmp = 1; + return (start + length - 1); + } + + // Iterates all keys, calls the |visitor| on each + void scan(Context *context, ScanVisitor *visitor, uint32_t start, + size_t length) { + (*visitor)(&m_data[start * m_key_size], length); + } + + // Erases a whole slot by shifting all larger keys to the "left" + void erase(Context *context, size_t node_count, int slot) { + if (slot < (int)node_count - 1) + memmove(&m_data[slot * m_key_size], &m_data[(slot + 1) * m_key_size], + m_key_size * (node_count - slot - 1)); + } + + // Inserts a key + template<typename Cmp> + PBtreeNode::InsertResult insert(Context *context, size_t node_count, + const ham_key_t *key, uint32_t flags, Cmp &comparator, + int slot) { + if (node_count > (size_t)slot) + memmove(&m_data[(slot + 1) * m_key_size], &m_data[slot * m_key_size], + m_key_size * (node_count - slot)); + set_key_data(slot, key->data, key->size); + return (PBtreeNode::InsertResult(0, slot)); + } + + // Returns true if the |key| no longer fits into the node + bool requires_split(size_t node_count, const ham_key_t *key) const { + return ((node_count + 1) * m_key_size >= m_range_size); + } + + // Copies |count| key from this[sstart] to dest[dstart] + void copy_to(int sstart, size_t node_count, BinaryKeyList &dest, + size_t other_count, int dstart) { + memcpy(&dest.m_data[dstart * m_key_size], &m_data[sstart * m_key_size], + m_key_size * (node_count - sstart)); + } + + // Change the capacity; for PAX layouts this just means copying the + // data from one place to the other + void change_range_size(size_t node_count, uint8_t *new_data_ptr, + size_t new_range_size, size_t capacity_hint) { + memmove(new_data_ptr, m_data, node_count * m_key_size); + m_data = new_data_ptr; + m_range_size = new_range_size; + } + + // Fills the btree_metrics structure + void fill_metrics(btree_metrics_t 
*metrics, size_t node_count) { + BaseKeyList::fill_metrics(metrics, node_count); + BtreeStatistics::update_min_max_avg(&metrics->keylist_unused, + m_range_size - (node_count * m_key_size)); + } + + // Prints a slot to |out| (for debugging) + void print(Context *context, int slot, std::stringstream &out) const { + for (size_t i = 0; i < m_key_size; i++) + out << (char)m_data[slot * m_key_size + i]; + } + + // Returns the key size + size_t get_key_size(int slot) const { + return (m_key_size); + } + + // Returns the pointer to a key's data + uint8_t *get_key_data(int slot) { + return (&m_data[slot * m_key_size]); + } + + // Has support for SIMD style search? + bool has_simd_support() const { + return (false); + } + + // Returns the pointer to the key's inline data - for SIMD calculations + // Not implemented by this KeyList + uint8_t *get_simd_data() { + return (0); + } + + private: + // Returns the pointer to a key's data (const flavour) + uint8_t *get_key_data(int slot) const { + return (&m_data[slot * m_key_size]); + } + + // Overwrites a key's data. The |size| of the new data HAS + // to be identical to the "official" key size + void set_key_data(int slot, const void *ptr, size_t size) { + ham_assert(size == get_key_size(slot)); + memcpy(&m_data[slot * m_key_size], ptr, size); + } + + // The size of a single key + size_t m_key_size; + + // Pointer to the actual key data + uint8_t *m_data; +}; + +} // namespace PaxLayout + +} // namespace hamsterdb + +#endif /* HAM_BTREE_KEYS_BINARY_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_keys_pod.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_keys_pod.h new file mode 100644 index 0000000000..1a0582da69 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_keys_pod.h @@ -0,0 +1,261 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Fixed length KeyList for built-in data types ("POD types") + * + * This is the fastest KeyList available. It stores POD data sequentially + * in an array, i.e. PodKeyList<uint32_t> is simply a plain + * C array of type uint32_t[]. Each key has zero overhead. + * + * This KeyList cannot be resized. + * + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_KEYS_POD_H +#define HAM_BTREE_KEYS_POD_H + +#include "0root/root.h" + +#include <sstream> +#include <iostream> + +// Always verify that a file of level N does not include headers > N! +#include "1globals/globals.h" +#include "1base/dynamic_array.h" +#include "2page/page.h" +#include "3btree/btree_node.h" +#include "3btree/btree_keys_base.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +// +// The template classes in this file are wrapped in a separate namespace +// to avoid naming clashes with btree_impl_default.h +// +namespace PaxLayout { + +// +// The PodKeyList provides simplified access to a list of keys where each +// key is of type T (i.e. uint32_t). 
+// +template<typename T> +class PodKeyList : public BaseKeyList +{ + public: + enum { + // A flag whether this KeyList has sequential data + kHasSequentialData = 1, + + // A flag whether this KeyList supports the scan() call + kSupportsBlockScans = 1, + + // This KeyList uses a custom SIMD implementation if possible, + // otherwise binary search in combination with linear search + kSearchImplementation = kBinaryLinear, + }; + + // Constructor + PodKeyList(LocalDatabase *db) + : m_data(0) { + } + + // Creates a new PodKeyList starting at |ptr|, total size is + // |range_size| (in bytes) + void create(uint8_t *data, size_t range_size) { + m_data = (T *)data; + m_range_size = range_size; + } + + // Opens an existing PodKeyList starting at |ptr| + void open(uint8_t *data, size_t range_size, size_t node_count) { + m_data = (T *)data; + m_range_size = range_size; + } + + // Returns the required size for the current set of keys + size_t get_required_range_size(size_t node_count) const { + return (node_count * sizeof(T)); + } + + // Returns the actual key size including overhead + size_t get_full_key_size(const ham_key_t *key = 0) const { + return (sizeof(T)); + } + + // Copies a key into |dest| + void get_key(Context *context, int slot, ByteArray *arena, ham_key_t *dest, + bool deep_copy = true) const { + dest->size = sizeof(T); + if (deep_copy == false) { + dest->data = &m_data[slot]; + return; + } + + // allocate memory (if required) + if (!(dest->flags & HAM_KEY_USER_ALLOC)) { + arena->resize(dest->size); + dest->data = arena->get_ptr(); + } + + memcpy(dest->data, &m_data[slot], sizeof(T)); + } + + // Returns the threshold when switching from binary search to + // linear search + size_t get_linear_search_threshold() const { + return (128 / sizeof(T)); + } + + // Performs a linear search in a given range between |start| and + // |start + length| + template<typename Cmp> + int linear_search(size_t start, size_t length, const ham_key_t *hkey, + Cmp &comparator, int 
*pcmp) { + T key = *(T *)hkey->data; + size_t c = start; + size_t end = start + length; + + #undef COMPARE + #define COMPARE(c) if (key <= m_data[c]) { \ + if (key < m_data[c]) { \ + if (c == 0) \ + *pcmp = -1; /* key < m_data[0] */ \ + else \ + *pcmp = +1; /* key > m_data[c - 1] */ \ + return ((c) - 1); \ + } \ + *pcmp = 0; \ + return (c); \ + } + + while (c + 8 < end) { + COMPARE(c) + COMPARE(c + 1) + COMPARE(c + 2) + COMPARE(c + 3) + COMPARE(c + 4) + COMPARE(c + 5) + COMPARE(c + 6) + COMPARE(c + 7) + c += 8; + } + + while (c < end) { + COMPARE(c) + c++; + } + + /* the new key is > the last key in the page */ + *pcmp = 1; + return (start + length - 1); + } + + // Iterates all keys, calls the |visitor| on each + void scan(Context *context, ScanVisitor *visitor, uint32_t start, + size_t length) { + (*visitor)(&m_data[start], length); + } + + // Erases a whole slot by shifting all larger keys to the "left" + void erase(Context *context, size_t node_count, int slot) { + if (slot < (int)node_count - 1) + memmove(&m_data[slot], &m_data[slot + 1], + sizeof(T) * (node_count - slot - 1)); + } + + // Inserts a key + template<typename Cmp> + PBtreeNode::InsertResult insert(Context *context, size_t node_count, + const ham_key_t *key, uint32_t flags, Cmp &comparator, + int slot) { + if (node_count > (size_t)slot) + memmove(&m_data[slot + 1], &m_data[slot], + sizeof(T) * (node_count - slot)); + set_key_data(slot, key->data, key->size); + return (PBtreeNode::InsertResult(0, slot)); + } + + // Copies |count| key from this[sstart] to dest[dstart] + void copy_to(int sstart, size_t node_count, PodKeyList<T> &dest, + size_t other_count, int dstart) { + memcpy(&dest.m_data[dstart], &m_data[sstart], + sizeof(T) * (node_count - sstart)); + } + + // Returns true if the |key| no longer fits into the node + bool requires_split(size_t node_count, const ham_key_t *key) const { + return ((node_count + 1) * sizeof(T) >= m_range_size); + } + + // Change the range size; just copy the data from 
one place to the other + void change_range_size(size_t node_count, uint8_t *new_data_ptr, + size_t new_range_size, size_t capacity_hint) { + memmove(new_data_ptr, m_data, node_count * sizeof(T)); + m_data = (T *)new_data_ptr; + m_range_size = new_range_size; + } + + // Fills the btree_metrics structure + void fill_metrics(btree_metrics_t *metrics, size_t node_count) { + BaseKeyList::fill_metrics(metrics, node_count); + BtreeStatistics::update_min_max_avg(&metrics->keylist_unused, + m_range_size - (node_count * sizeof(T))); + } + + // Prints a slot to |out| (for debugging) + void print(Context *context, int slot, std::stringstream &out) const { + out << m_data[slot]; + } + + // Returns the size of a key + size_t get_key_size(int slot) const { + return (sizeof(T)); + } + + // Returns a pointer to the key's data + uint8_t *get_key_data(int slot) { + return ((uint8_t *)&m_data[slot]); + } + + private: + // Returns a pointer to the key's data (const flavour) + uint8_t *get_key_data(int slot) const { + return ((uint8_t *)&m_data[slot]); + } + + // Overwrites an existing key; the |size| of the new data HAS to be + // identical with the key size specified when the database was created! + void set_key_data(int slot, const void *ptr, size_t size) { + ham_assert(size == sizeof(T)); + m_data[slot] = *(T *)ptr; + } + + // The actual array of T's + T *m_data; +}; + +} // namespace PaxLayout + +} // namespace hamsterdb + +#endif /* HAM_BTREE_KEYS_POD_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_keys_varlen.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_keys_varlen.h new file mode 100644 index 0000000000..5f85676c56 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_keys_varlen.h @@ -0,0 +1,533 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Variable length KeyList + * + * Each key is stored in a "chunk", and the chunks are managed by an upfront + * index which contains offset and size of each chunk. The index also keeps + * track of deleted chunks. + * + * The actual chunk data contains the key's data (which can be a 64bit blob + * ID if the key is too big). + * + * If the key is too big (exceeds |m_extkey_threshold|) then it's offloaded + * to an external blob, and only the 64bit record id of this blob is stored + * in the node. These "extended keys" are cached; the cache's lifetime is + * coupled to the lifetime of the node. + * + * To avoid expensive memcpy-operations, erasing a key only affects this + * upfront index: the relevant slot is moved to a "freelist". This freelist + * contains the same meta information as the index table. + * + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_KEYS_VARLEN_H +#define HAM_BTREE_KEYS_VARLEN_H + +#include "0root/root.h" + +#include <algorithm> +#include <iostream> +#include <vector> +#include <map> + +// Always verify that a file of level N does not include headers > N! 
+#include "1globals/globals.h" +#include "1base/dynamic_array.h" +#include "1base/scoped_ptr.h" +#include "2page/page.h" +#include "3blob_manager/blob_manager.h" +#include "3btree/btree_node.h" +#include "3btree/btree_index.h" +#include "3btree/upfront_index.h" +#include "3btree/btree_keys_base.h" +#include "4env/env_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +namespace DefLayout { + +// +// Variable length keys +// +// This KeyList uses an UpfrontIndex to manage the variable length data +// chunks. The UpfrontIndex knows the sizes of the chunks, and therefore +// the VariableLengthKeyList does *not* store additional size information. +// +// The format of a single key is: +// |Flags|Data...| +// where Flags are 8 bit. +// +// The key size (as specified by the user when inserting the key) therefore +// is UpfrontIndex::get_chunk_size() - 1. +// +class VariableLengthKeyList : public BaseKeyList +{ + // for caching external keys + typedef std::map<uint64_t, ByteArray> ExtKeyCache; + + public: + enum { + // A flag whether this KeyList has sequential data + kHasSequentialData = 0, + + // A flag whether this KeyList supports the scan() call + kSupportsBlockScans = 0, + + // This KeyList can reduce its capacity in order to release storage + kCanReduceCapacity = 1, + + // This KeyList uses binary search + kSearchImplementation = kBinarySearch, + }; + + // Constructor + VariableLengthKeyList(LocalDatabase *db) + : m_db(db), m_index(db), m_data(0) { + size_t page_size = db->lenv()->config().page_size_bytes; + if (Globals::ms_extended_threshold) + m_extkey_threshold = Globals::ms_extended_threshold; + else { + if (page_size == 1024) + m_extkey_threshold = 64; + else if (page_size <= 1024 * 8) + m_extkey_threshold = 128; + else { + // UpfrontIndex's chunk size has 8 bit (max 255), and reserve + // a few bytes for metadata (flags) + m_extkey_threshold = 250; + } + } + } + + // Creates a new KeyList starting at |ptr|, 
total size is + // |range_size| (in bytes) + void create(uint8_t *data, size_t range_size) { + m_data = data; + m_range_size = range_size; + m_index.create(m_data, range_size, range_size / get_full_key_size()); + } + + // Opens an existing KeyList + void open(uint8_t *data, size_t range_size, size_t node_count) { + m_data = data; + m_range_size = range_size; + m_index.open(m_data, range_size); + } + + // Calculates the required size for a range + size_t get_required_range_size(size_t node_count) const { + return (m_index.get_required_range_size(node_count)); + } + + // Returns the actual key size including overhead. This is an estimate + // since we don't know how large the keys will be + size_t get_full_key_size(const ham_key_t *key = 0) const { + if (!key) + return (24 + m_index.get_full_index_size() + 1); + // always make sure to have enough space for an extkey id + if (key->size < 8 || key->size > m_extkey_threshold) + return (sizeof(uint64_t) + m_index.get_full_index_size() + 1); + return (key->size + m_index.get_full_index_size() + 1); + } + + // Copies a key into |dest| + void get_key(Context *context, int slot, ByteArray *arena, ham_key_t *dest, + bool deep_copy = true) { + ham_key_t tmp; + uint32_t offset = m_index.get_chunk_offset(slot); + uint8_t *p = m_index.get_chunk_data_by_offset(offset); + + if (unlikely(*p & BtreeKey::kExtendedKey)) { + memset(&tmp, 0, sizeof(tmp)); + get_extended_key(context, get_extended_blob_id(slot), &tmp); + } + else { + tmp.size = get_key_size(slot); + tmp.data = p + 1; + } + + dest->size = tmp.size; + + if (likely(deep_copy == false)) { + dest->data = tmp.data; + return; + } + + // allocate memory (if required) + if (!(dest->flags & HAM_KEY_USER_ALLOC)) { + arena->resize(tmp.size); + dest->data = arena->get_ptr(); + } + memcpy(dest->data, tmp.data, tmp.size); + } + + // Iterates all keys, calls the |visitor| on each. Not supported by + // this KeyList implementation. 
For variable length keys, the caller + // must iterate over all keys. The |scan()| interface is only implemented + // for PAX style layouts. + void scan(Context *context, ScanVisitor *visitor, size_t node_count, + uint32_t start) { + ham_assert(!"shouldn't be here"); + throw Exception(HAM_INTERNAL_ERROR); + } + + // Erases a key's payload. Does NOT remove the chunk from the UpfrontIndex + // (see |erase()|). + void erase_extended_key(Context *context, int slot) { + uint8_t flags = get_key_flags(slot); + if (flags & BtreeKey::kExtendedKey) { + // delete the extended key from the cache + erase_extended_key(context, get_extended_blob_id(slot)); + // and transform into a key which is non-extended and occupies + // the same space as before, when it was extended + set_key_flags(slot, flags & (~BtreeKey::kExtendedKey)); + set_key_size(slot, sizeof(uint64_t)); + } + } + + // Erases a key, including extended blobs + void erase(Context *context, size_t node_count, int slot) { + erase_extended_key(context, slot); + m_index.erase(node_count, slot); + } + + // Inserts the |key| at the position identified by |slot|. + // This method cannot fail; there MUST be sufficient free space in the + // node (otherwise the caller would have split the node). 
+ template<typename Cmp> + PBtreeNode::InsertResult insert(Context *context, size_t node_count, + const ham_key_t *key, uint32_t flags, + Cmp &comparator, int slot) { + m_index.insert(node_count, slot); + + // now there's one additional slot + node_count++; + + uint32_t key_flags = 0; + + // When inserting the data: always add 1 byte for key flags + if (key->size <= m_extkey_threshold + && m_index.can_allocate_space(node_count, key->size + 1)) { + uint32_t offset = m_index.allocate_space(node_count, slot, + key->size + 1); + uint8_t *p = m_index.get_chunk_data_by_offset(offset); + *p = key_flags; + memcpy(p + 1, key->data, key->size); // and data + } + else { + uint64_t blob_id = add_extended_key(context, key); + m_index.allocate_space(node_count, slot, 8 + 1); + set_extended_blob_id(slot, blob_id); + set_key_flags(slot, key_flags | BtreeKey::kExtendedKey); + } + + return (PBtreeNode::InsertResult(0, slot)); + } + + // Returns true if the |key| no longer fits into the node and a split + // is required. Makes sure that there is ALWAYS enough headroom + // for an extended key! 
+ // + // If there's no key specified then always assume the worst case and + // pretend that the key has the maximum length + bool requires_split(size_t node_count, const ham_key_t *key) { + size_t required; + if (key) { + required = key->size + 1; + // add 1 byte for flags + if (key->size > m_extkey_threshold || key->size < 8 + 1) + required = 8 + 1; + } + else + required = m_extkey_threshold + 1; + return (m_index.requires_split(node_count, required)); + } + + // Copies |count| key from this[sstart] to dest[dstart] + void copy_to(int sstart, size_t node_count, + VariableLengthKeyList &dest, size_t other_node_count, + int dstart) { + size_t to_copy = node_count - sstart; + ham_assert(to_copy > 0); + + // make sure that the other node has sufficient capacity in its + // UpfrontIndex + dest.m_index.change_range_size(other_node_count, 0, 0, + m_index.get_capacity()); + + for (size_t i = 0; i < to_copy; i++) { + size_t size = get_key_size(sstart + i); + + uint8_t *p = m_index.get_chunk_data_by_offset( + m_index.get_chunk_offset(sstart + i)); + uint8_t flags = *p; + uint8_t *data = p + 1; + + dest.m_index.insert(other_node_count + i, dstart + i); + // Add 1 byte for key flags + uint32_t offset = dest.m_index.allocate_space(other_node_count + i + 1, + dstart + i, size + 1); + p = dest.m_index.get_chunk_data_by_offset(offset); + *p = flags; // sets flags + memcpy(p + 1, data, size); // and data + } + + // A lot of keys will be invalidated after copying, therefore make + // sure that the next_offset is recalculated when it's required + m_index.invalidate_next_offset(); + } + + // Checks the integrity of this node. Throws an exception if there is a + // violation. 
+ void check_integrity(Context *context, size_t node_count) const { + ByteArray arena; + + // verify that the offsets and sizes are not overlapping + m_index.check_integrity(node_count); + + // make sure that extkeys are handled correctly + for (size_t i = 0; i < node_count; i++) { + if (get_key_size(i) > m_extkey_threshold + && !(get_key_flags(i) & BtreeKey::kExtendedKey)) { + ham_log(("key size %d, but key is not extended", get_key_size(i))); + throw Exception(HAM_INTEGRITY_VIOLATED); + } + + if (get_key_flags(i) & BtreeKey::kExtendedKey) { + uint64_t blobid = get_extended_blob_id(i); + if (!blobid) { + ham_log(("integrity check failed: item %u " + "is extended, but has no blob", i)); + throw Exception(HAM_INTEGRITY_VIOLATED); + } + + // make sure that the extended blob can be loaded + ham_record_t record = {0}; + m_db->lenv()->blob_manager()->read(context, blobid, + &record, 0, &arena); + + // compare it to the cached key (if there is one) + if (m_extkey_cache) { + ExtKeyCache::iterator it = m_extkey_cache->find(blobid); + if (it != m_extkey_cache->end()) { + if (record.size != it->second.get_size()) { + ham_log(("Cached extended key differs from real key")); + throw Exception(HAM_INTEGRITY_VIOLATED); + } + if (memcmp(record.data, it->second.get_ptr(), record.size)) { + ham_log(("Cached extended key differs from real key")); + throw Exception(HAM_INTEGRITY_VIOLATED); + } + } + } + } + } + } + + // Rearranges the list + void vacuumize(size_t node_count, bool force) { + if (force) + m_index.increase_vacuumize_counter(100); + m_index.maybe_vacuumize(node_count); + } + + // Change the range size; the capacity will be adjusted, the data is + // copied as necessary + void change_range_size(size_t node_count, uint8_t *new_data_ptr, + size_t new_range_size, size_t capacity_hint) { + // no capacity given? 
then try to find a good default one + if (capacity_hint == 0) { + capacity_hint = (new_range_size - m_index.get_next_offset(node_count) + - get_full_key_size()) / m_index.get_full_index_size(); + if (capacity_hint <= node_count) + capacity_hint = node_count + 1; + } + + // if there's not enough space for the new capacity then try to reduce + // the capacity + if (m_index.get_next_offset(node_count) + get_full_key_size(0) + + capacity_hint * m_index.get_full_index_size() + + UpfrontIndex::kPayloadOffset + > new_range_size) + capacity_hint = node_count + 1; + + m_index.change_range_size(node_count, new_data_ptr, new_range_size, + capacity_hint); + m_data = new_data_ptr; + m_range_size = new_range_size; + } + + // Fills the btree_metrics structure + void fill_metrics(btree_metrics_t *metrics, size_t node_count) { + BaseKeyList::fill_metrics(metrics, node_count); + BtreeStatistics::update_min_max_avg(&metrics->keylist_index, + (uint32_t)(m_index.get_capacity() + * m_index.get_full_index_size())); + BtreeStatistics::update_min_max_avg(&metrics->keylist_unused, + m_range_size + - (uint32_t)m_index.get_required_range_size(node_count)); + } + + // Prints a slot to |out| (for debugging) + void print(Context *context, int slot, std::stringstream &out) { + ham_key_t tmp = {0}; + if (get_key_flags(slot) & BtreeKey::kExtendedKey) { + get_extended_key(context, get_extended_blob_id(slot), &tmp); + } + else { + tmp.size = get_key_size(slot); + tmp.data = get_key_data(slot); + } + out << (const char *)tmp.data; + } + + // Returns the pointer to a key's inline data (const flavour) + uint8_t *get_key_data(int slot) const { + uint32_t offset = m_index.get_chunk_offset(slot); + return (m_index.get_chunk_data_by_offset(offset) + 1); + } + + // Returns the size of a key + size_t get_key_size(int slot) const { + return (m_index.get_chunk_size(slot) - 1); + } + + private: + // Returns the flags of a key. 
Flags are defined in btree_flags.h + uint8_t get_key_flags(int slot) const { + uint32_t offset = m_index.get_chunk_offset(slot); + return (*m_index.get_chunk_data_by_offset(offset)); + } + + // Sets the flags of a key. Flags are defined in btree_flags.h + void set_key_flags(int slot, uint8_t flags) { + uint32_t offset = m_index.get_chunk_offset(slot); + *m_index.get_chunk_data_by_offset(offset) = flags; + } + + // Overwrites the (inline) data of the key + void set_key_data(int slot, const void *ptr, size_t size) { + ham_assert(m_index.get_chunk_size(slot) >= size); + set_key_size(slot, (uint16_t)size); + memcpy(get_key_data(slot), ptr, size); + } + + // Sets the size of a key + void set_key_size(int slot, size_t size) { + ham_assert(size + 1 <= m_index.get_chunk_size(slot)); + m_index.set_chunk_size(slot, size + 1); + } + + // Returns the record address of an extended key overflow area + uint64_t get_extended_blob_id(int slot) const { + return (*(uint64_t *)get_key_data(slot)); + } + + // Sets the record address of an extended key overflow area + void set_extended_blob_id(int slot, uint64_t blobid) { + *(uint64_t *)get_key_data(slot) = blobid; + } + + // Erases an extended key from disk and from the cache + void erase_extended_key(Context *context, uint64_t blobid) { + m_db->lenv()->blob_manager()->erase(context, blobid); + if (m_extkey_cache) { + ExtKeyCache::iterator it = m_extkey_cache->find(blobid); + if (it != m_extkey_cache->end()) + m_extkey_cache->erase(it); + } + } + + // Retrieves the extended key at |blobid| and stores it in |key|; will + // use the cache. 
+ void get_extended_key(Context *context, uint64_t blob_id, ham_key_t *key) { + if (!m_extkey_cache) + m_extkey_cache.reset(new ExtKeyCache()); + else { + ExtKeyCache::iterator it = m_extkey_cache->find(blob_id); + if (it != m_extkey_cache->end()) { + key->size = it->second.get_size(); + key->data = it->second.get_ptr(); + return; + } + } + + ByteArray arena; + ham_record_t record = {0}; + m_db->lenv()->blob_manager()->read(context, blob_id, &record, + HAM_FORCE_DEEP_COPY, &arena); + (*m_extkey_cache)[blob_id] = arena; + arena.disown(); + key->data = record.data; + key->size = record.size; + } + + // Allocates an extended key and stores it in the cache + uint64_t add_extended_key(Context *context, const ham_key_t *key) { + if (!m_extkey_cache) + m_extkey_cache.reset(new ExtKeyCache()); + + ham_record_t rec = {0}; + rec.data = key->data; + rec.size = key->size; + + uint64_t blob_id = m_db->lenv()->blob_manager()->allocate( + context, &rec, 0); + ham_assert(blob_id != 0); + ham_assert(m_extkey_cache->find(blob_id) == m_extkey_cache->end()); + + ByteArray arena; + arena.resize(key->size); + memcpy(arena.get_ptr(), key->data, key->size); + (*m_extkey_cache)[blob_id] = arena; + arena.disown(); + + // increment counter (for statistics) + Globals::ms_extended_keys++; + + return (blob_id); + } + + // The database + LocalDatabase *m_db; + + // The index for managing the variable-length chunks + UpfrontIndex m_index; + + // Pointer to the data of the node + uint8_t *m_data; + + // Cache for extended keys + ScopedPtr<ExtKeyCache> m_extkey_cache; + + // Threshold for extended keys; if key size is > threshold then the + // key is moved to a blob + size_t m_extkey_threshold; +}; + +} // namespace DefLayout + +} // namespace hamsterdb + +#endif /* HAM_BTREE_KEYS_VARLEN_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_node.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_node.h new file mode 100644 index 0000000000..854e68e1a5 --- /dev/null +++ 
b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_node.h @@ -0,0 +1,175 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_NODE_H +#define HAM_BTREE_NODE_H + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "2page/page.h" +#include "3btree/btree_flags.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class PBtreeKeyDefault; + +#include "1base/packstart.h" + +/* + * A BtreeNode structure spans the persistent part of a Page + * + * This structure is directly written to/read from the file. 
+ */ +HAM_PACK_0 struct HAM_PACK_1 PBtreeNode +{ + public: + // Result of the insert() operation + struct InsertResult { + InsertResult(ham_status_t _status = 0, int _slot = 0) + : status(_status), slot(_slot) { + } + + // hamsterdb status code + ham_status_t status; + + // the slot of the new (or existing) key + int slot; + }; + + enum { + // insert key at the beginning of the page + kInsertPrepend = 1, + + // append key to the end of the page + kInsertAppend = 2, + }; + + enum { + // node is a leaf + kLeafNode = 1 + }; + + // Returns a PBtreeNode from a Page + static PBtreeNode *from_page(Page *page) { + return ((PBtreeNode *)page->get_payload()); + } + + // Returns the offset (in bytes) of the member |m_data| + static uint32_t get_entry_offset() { + return (sizeof(PBtreeNode) - 1); + } + + // Returns the flags of the btree node (|kLeafNode|) + uint32_t get_flags() const { + return (m_flags); + } + + // Sets the flags of the btree node (|kLeafNode|) + void set_flags(uint32_t flags) { + m_flags = flags; + } + + // Returns the number of entries in a BtreeNode + uint32_t get_count() const { + return (m_count); + } + + // Sets the number of entries in a BtreeNode + void set_count(uint32_t count) { + m_count = count; + } + + // Returns the address of the left sibling of this node + uint64_t get_left() const { + return (m_left); + } + + // Sets the address of the left sibling of this node + void set_left(uint64_t left) { + m_left = left; + } + + // Returns the address of the right sibling of this node + uint64_t get_right() const { + return (m_right); + } + + // Sets the address of the right sibling of this node + void set_right(uint64_t right) { + m_right = right; + } + + // Returns the ptr_down of this node + uint64_t get_ptr_down() const { + return (m_ptr_down); + } + + // Returns true if this btree node is a leaf node + bool is_leaf() const { + return (m_flags & kLeafNode); + } + + // Sets the ptr_down of this node + void set_ptr_down(uint64_t ptr_down) { + 
m_ptr_down = ptr_down; + } + + // Returns a pointer to the key data + uint8_t *get_data() { + return (&m_data[0]); + } + + const uint8_t *get_data() const { + return (&m_data[0]); + } + + private: + // flags of this node + uint32_t m_flags; + + // number of used entries in the node + uint32_t m_count; + + // address of left sibling + uint64_t m_left; + + // address of right sibling + uint64_t m_right; + + // address of child node whose items are smaller than all items + // in this node + uint64_t m_ptr_down; + + // the entries of this node + uint8_t m_data[1]; + +} HAM_PACK_2; + +#include "1base/packstop.h" + +} // namespace hamsterdb + +#endif /* HAM_BTREE_NODE_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_node_proxy.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_node_proxy.h new file mode 100644 index 0000000000..110bd05f08 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_node_proxy.h @@ -0,0 +1,609 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_NODE_PROXY_H +#define HAM_BTREE_NODE_PROXY_H + +#include "0root/root.h" + +#include <set> +#include <string.h> +#include <iostream> +#include <sstream> +#include <fstream> + +// Always verify that a file of level N does not include headers > N! 
+#include "1base/abi.h" +#include "1base/dynamic_array.h" +#include "1base/error.h" +#include "2page/page.h" +#include "3btree/btree_node.h" +#include "3blob_manager/blob_manager.h" +#include "4env/env_local.h" +#include "4db/db_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +struct Context; +struct ScanVisitor; + +// +// A BtreeNodeProxy wraps a PBtreeNode structure and defines the actual +// format of the btree payload. +// +// The BtreeNodeProxy class provides access to the actual Btree nodes. The +// layout of those nodes depends heavily on the database configuration, +// and is implemented by template classes (btree_impl_default.h, +// btree_impl_pax.h.). +// +class BtreeNodeProxy +{ + public: + // Constructor + BtreeNodeProxy(Page *page) + : m_page(page) { + } + + // Destructor + virtual ~BtreeNodeProxy() { + } + + // Returns the flags of the btree node (|kLeafNode|) + uint32_t get_flags() const { + return (PBtreeNode::from_page(m_page)->get_flags()); + } + + // Sets the flags of the btree node (|kLeafNode|) + void set_flags(uint32_t flags) { + PBtreeNode::from_page(m_page)->set_flags(flags); + } + + // Returns the number of entries in the BtreeNode + size_t get_count() const { + return (PBtreeNode::from_page(m_page)->get_count()); + } + + // Sets the number of entries in the BtreeNode + void set_count(size_t count) { + PBtreeNode::from_page(m_page)->set_count((uint32_t)count); + } + + // Returns true if this btree node is a leaf node + bool is_leaf() const { + return (PBtreeNode::from_page(m_page)->is_leaf()); + } + + // Returns the address of the left sibling of this node + uint64_t get_left() const { + return (PBtreeNode::from_page(m_page)->get_left()); + } + + // Sets the address of the left sibling of this node + void set_left(uint64_t address) { + PBtreeNode::from_page(m_page)->set_left(address); + } + + // Returns the address of the right sibling of this node + uint64_t get_right() const { + return 
(PBtreeNode::from_page(m_page)->get_right()); + } + + // Sets the address of the right sibling of this node + void set_right(uint64_t address) { + PBtreeNode::from_page(m_page)->set_right(address); + } + + // Returns the ptr_down of this node + uint64_t get_ptr_down() const { + return (PBtreeNode::from_page(m_page)->get_ptr_down()); + } + + // Sets the ptr_down of this node + void set_ptr_down(uint64_t address) { + PBtreeNode::from_page(m_page)->set_ptr_down(address); + } + + // Returns the page pointer - const version + const Page *get_page() const { + return (m_page); + } + + // Returns the page pointer + Page *get_page() { + return (m_page); + } + + // Returns the estimated capacity of this node + virtual size_t estimate_capacity() const = 0; + + // Checks the integrity of the node. Throws an exception if it is + // not. Called by ham_db_check_integrity(). + virtual void check_integrity(Context *context) const = 0; + + // Iterates all keys, calls the |visitor| on each + virtual void scan(Context *context, ScanVisitor *visitor, + size_t start, bool distinct) = 0; + + // Compares the two keys. Returns 0 if both are equal, otherwise -1 (if + // |lhs| is greater) or +1 (if |rhs| is greater). + virtual int compare(const ham_key_t *lhs, const ham_key_t *rhs) const = 0; + + // Compares a public key and an internal key + virtual int compare(Context *context, const ham_key_t *lhs, int rhs) = 0; + + // Returns true if the public key (|lhs|) and an internal key (slot + // |rhs|) are equal + virtual bool equals(Context *context, const ham_key_t *lhs, int rhs) = 0; + + // Searches the node for the |key|, and returns the slot of this key. + // If |record_id| is not null then it will store the result of the last + // compare operation. + // If |pcmp| is not null then it will store the result of the last + // compare operation. 
+ virtual int find_child(Context *context, ham_key_t *key, + uint64_t *record_id = 0, int *pcmp = 0) = 0; + + // Searches the node for the |key|, but will always return -1 if + // an exact match was not found + virtual int find_exact(Context *context, ham_key_t *key) = 0; + + // Returns the full key at the |slot|. Also resolves extended keys + // and respects HAM_KEY_USER_ALLOC in dest->flags. + virtual void get_key(Context *context, int slot, ByteArray *arena, + ham_key_t *dest) = 0; + + // Returns the number of records of a key at the given |slot|. This is + // either 1 or higher, but only if duplicate keys exist. + virtual int get_record_count(Context *context, int slot) = 0; + + // Returns the record size of a key or one of its duplicates. + virtual uint64_t get_record_size(Context *context, int slot, + int duplicate_index) = 0; + + // Returns the record id of the key at the given |slot| + // Only for internal nodes! + virtual uint64_t get_record_id(Context *context, int slot) const = 0; + + // Sets the record id of the key at the given |slot| + // Only for internal nodes! + virtual void set_record_id(Context *context, int slot, uint64_t id) = 0; + + // Returns the full record and stores it in |dest|. The record is identified + // by |slot| and |duplicate_index|. TINY and SMALL records are handled + // correctly, as well as HAM_DIRECT_ACCESS. + virtual void get_record(Context *context, int slot, ByteArray *arena, + ham_record_t *record, uint32_t flags, + int duplicate_index = 0) = 0; + + // High-level function to set a new record + // + // flags can be + // - HAM_OVERWRITE + // - HAM_DUPLICATE* + // + // a previously existing blob will be deleted if necessary + virtual void set_record(Context *context, int slot, ham_record_t *record, + int duplicate_index, uint32_t flags, + uint32_t *new_duplicate_index) = 0; + + // Removes the record (or the duplicate of it, if |duplicate_index| is > 0). 
+ // If |all_duplicates| is set then all duplicates of this key are deleted. + // |has_duplicates_left| will return true if there are more duplicates left + // after the current one was deleted. + virtual void erase_record(Context *context, int slot, int duplicate_index, + bool all_duplicates, bool *has_duplicates_left) = 0; + + // High level function to remove an existing entry + virtual void erase(Context *context, int slot) = 0; + + // Erases all extended keys, overflow areas and records that are + // linked from this page; usually called when the Database is deleted + // or an In-Memory Database is freed + virtual void remove_all_entries(Context *context) = 0; + + // High level function to insert a new key. Only inserts the key. The + // actual record is then updated with |set_record|. + virtual PBtreeNode::InsertResult insert(Context *context, ham_key_t *key, + uint32_t flags) = 0; + + // Returns true if a node requires a split to insert a new |key| + virtual bool requires_split(Context *context, const ham_key_t *key = 0) = 0; + + // Returns true if a node requires a merge or a shift + virtual bool requires_merge() const = 0; + + // Splits a page and moves all elements at a position >= |pivot| + // to the |other| page. If the node is a leaf node then the pivot element + // is also copied, otherwise it is not because it will be propagated + // to the parent node instead (by the caller). + virtual void split(Context *context, BtreeNodeProxy *other, int pivot) = 0; + + // Merges all keys from the |other| node to this node + virtual void merge_from(Context *context, BtreeNodeProxy *other) = 0; + + // Fills the btree_metrics structure + virtual void fill_metrics(btree_metrics_t *metrics) = 0; + + // Prints the node to stdout. Only for testing and debugging! + virtual void print(Context *context, size_t node_count = 0) = 0; + + // Returns the class name. Only for testing! Uses the functions exported + // by abi.h, which are only available on assorted platforms. 
Other + // platforms will return empty strings. + virtual std::string test_get_classname() const = 0; + + protected: + Page *m_page; +}; + +// +// A comparator which uses a user-supplied callback function (installed +// with |ham_db_set_compare_func|) to compare two keys +// +struct CallbackCompare +{ + CallbackCompare(LocalDatabase *db) + : m_db(db) { + } + + int operator()(const void *lhs_data, uint32_t lhs_size, + const void *rhs_data, uint32_t rhs_size) const { + return (m_db->compare_func()((::ham_db_t *)m_db, (uint8_t *)lhs_data, + lhs_size, (uint8_t *)rhs_data, rhs_size)); + } + + LocalDatabase *m_db; +}; + +// +// A comparator for numeric keys. +// The actual type for the key is supplied with a template parameter. +// This has to be a POD type with support for operators < and >. +// +template<typename T> +struct NumericCompare +{ + NumericCompare(LocalDatabase *) { + } + + int operator()(const void *lhs_data, uint32_t lhs_size, + const void *rhs_data, uint32_t rhs_size) const { + ham_assert(lhs_size == rhs_size); + ham_assert(lhs_size == sizeof(T)); + T l = *(T *)lhs_data; + T r = *(T *)rhs_data; + return (l < r ? -1 : (l > r ? +1 : 0)); + } +}; + +// +// The default comparator for two keys, implemented with memcmp(3). +// Both keys have the same size! +// +struct FixedSizeCompare +{ + FixedSizeCompare(LocalDatabase *) { + } + + int operator()(const void *lhs_data, uint32_t lhs_size, + const void *rhs_data, uint32_t rhs_size) const { + ham_assert(lhs_size == rhs_size); + return (::memcmp(lhs_data, rhs_data, lhs_size)); + } +}; + +// +// The default comparator for two keys, implemented with memcmp(3). +// Both keys can have different sizes! 
shorter strings are treated as +// "greater" +// +struct VariableSizeCompare +{ + VariableSizeCompare(LocalDatabase *) { + } + + int operator()(const void *lhs_data, uint32_t lhs_size, + const void *rhs_data, uint32_t rhs_size) const { + if (lhs_size < rhs_size) { + int m = ::memcmp(lhs_data, rhs_data, lhs_size); + return (m == 0 ? -1 : m); + } + if (rhs_size < lhs_size) { + int m = ::memcmp(lhs_data, rhs_data, rhs_size); + return (m == 0 ? +1 : m); + } + return (::memcmp(lhs_data, rhs_data, lhs_size)); + } +}; + +// +// An implementation of the BtreeNodeProxy interface declared above. +// Its actual memory implementation of the btree keys/records is delegated +// to a template parameter |NodeImpl|, and the key comparisons are +// delegated to |Comparator|. +// +template<class NodeImpl, class Comparator> +class BtreeNodeProxyImpl : public BtreeNodeProxy +{ + typedef BtreeNodeProxyImpl<NodeImpl, Comparator> ClassType; + + public: + // Constructor + BtreeNodeProxyImpl(Page *page) + : BtreeNodeProxy(page), m_impl(page) { + } + + // Returns the estimated capacity of this node + virtual size_t estimate_capacity() const { + return (m_impl.estimate_capacity()); + } + + // Checks the integrity of the node + virtual void check_integrity(Context *context) const { + m_impl.check_integrity(context); + } + + // Iterates all keys, calls the |visitor| on each + virtual void scan(Context *context, ScanVisitor *visitor, + size_t start, bool distinct) { + m_impl.scan(context, visitor, start, distinct); + } + + // Compares two internal keys using the supplied comparator + virtual int compare(const ham_key_t *lhs, const ham_key_t *rhs) const { + Comparator cmp(m_page->get_db()); + return (cmp(lhs->data, lhs->size, rhs->data, rhs->size)); + } + + // Compares a public key and an internal key + virtual int compare(Context *context, const ham_key_t *lhs, int rhs) { + Comparator cmp(m_page->get_db()); + return (m_impl.compare(context, lhs, rhs, cmp)); + } + + // Returns true if the public 
key and an internal key are equal + virtual bool equals(Context *context, const ham_key_t *lhs, int rhs) { + return (0 == compare(context, lhs, rhs)); + } + + // Searches the node for the key and returns the slot of this key. + // If |pcmp| is not null then it will store the result of the last + // compare operation. + virtual int find_child(Context *context, ham_key_t *key, + uint64_t *precord_id = 0, int *pcmp = 0) { + int dummy; + if (get_count() == 0) { + if (pcmp) + *pcmp = 1; + if (precord_id) + *precord_id = get_ptr_down(); + return (-1); + } + Comparator cmp(m_page->get_db()); + return (m_impl.find_child(context, key, cmp, + precord_id ? precord_id : 0, + pcmp ? pcmp : &dummy)); + } + + // Searches the node for the |key|, but will always return -1 if + // an exact match was not found + virtual int find_exact(Context *context, ham_key_t *key) { + if (get_count() == 0) + return (-1); + Comparator cmp(m_page->get_db()); + return (m_impl.find_exact(context, key, cmp)); + } + + // Returns the full key at the |slot|. Also resolves extended keys + // and respects HAM_KEY_USER_ALLOC in dest->flags. + virtual void get_key(Context *context, int slot, ByteArray *arena, + ham_key_t *dest) { + m_impl.get_key(context, slot, arena, dest); + } + + // Returns the number of records of a key at the given |slot| + virtual int get_record_count(Context *context, int slot) { + ham_assert(slot < (int)get_count()); + return (m_impl.get_record_count(context, slot)); + } + + // Returns the full record and stores it in |dest|. The record is identified + // by |slot| and |duplicate_index|. TINY and SMALL records are handled + // correctly, as well as HAM_DIRECT_ACCESS. 
+ virtual void get_record(Context *context, int slot, ByteArray *arena, + ham_record_t *record, uint32_t flags, + int duplicate_index = 0) { + ham_assert(slot < (int)get_count()); + m_impl.get_record(context, slot, arena, record, flags, duplicate_index); + } + + virtual void set_record(Context *context, int slot, ham_record_t *record, + int duplicate_index, uint32_t flags, + uint32_t *new_duplicate_index) { + m_impl.set_record(context, slot, record, duplicate_index, flags, + new_duplicate_index); + } + + // Returns the record size of a key or one of its duplicates + virtual uint64_t get_record_size(Context *context, int slot, + int duplicate_index) { + ham_assert(slot < (int)get_count()); + return (m_impl.get_record_size(context, slot, duplicate_index)); + } + + // Returns the record id of the key at the given |slot| + // Only for internal nodes! + virtual uint64_t get_record_id(Context *context, int slot) const { + ham_assert(slot < (int)get_count()); + return (m_impl.get_record_id(context, slot)); + } + + // Sets the record id of the key at the given |slot| + // Only for internal nodes! + virtual void set_record_id(Context *context, int slot, uint64_t id) { + return (m_impl.set_record_id(context, slot, id)); + } + + // High level function to remove an existing entry. Will call + // |erase_extended_key| to clean up (a potential) extended key, + // and |erase_record| on each record that is associated with the key. + virtual void erase(Context *context, int slot) { + ham_assert(slot < (int)get_count()); + m_impl.erase(context, slot); + set_count(get_count() - 1); + } + + // Removes the record (or the duplicate of it, if |duplicate_index| is > 0). + // If |all_duplicates| is set then all duplicates of this key are deleted. + // |has_duplicates_left| will return true if there are more duplicates left + // after the current one was deleted. 
+ virtual void erase_record(Context *context, int slot, int duplicate_index, + bool all_duplicates, bool *has_duplicates_left) { + ham_assert(slot < (int)get_count()); + m_impl.erase_record(context, slot, duplicate_index, all_duplicates); + if (has_duplicates_left) + *has_duplicates_left = get_record_count(context, slot) > 0; + } + + // Erases all extended keys, overflow areas and records that are + // linked from this page; usually called when the Database is deleted + // or an In-Memory Database is closed + virtual void remove_all_entries(Context *context) { + size_t node_count = get_count(); + for (size_t i = 0; i < node_count; i++) { + m_impl.erase_extended_key(context, i); + + // If we're in the leaf page, delete the associated record. (Only + // leaf nodes have records; internal nodes have record IDs that + // reference other pages, and these pages must not be deleted.) + if (is_leaf()) + erase_record(context, i, 0, true, 0); + } + } + + // High level function to insert a new key. Only inserts the key. The + // actual record is then updated with |set_record|. + virtual PBtreeNode::InsertResult insert(Context *context, + ham_key_t *key, uint32_t flags) { + PBtreeNode::InsertResult result(0, 0); + if (m_impl.requires_split(context, key)) { + result.status = HAM_LIMITS_REACHED; + return (result); + } + + Comparator cmp(m_page->get_db()); + try { + result = m_impl.insert(context, key, flags, cmp); + } + catch (Exception &ex) { + result.status = ex.code; + } + + // split required? 
then reorganize the node, try again + if (result.status == HAM_LIMITS_REACHED) { + try { + if (m_impl.reorganize(context, key)) + result = m_impl.insert(context, key, flags, cmp); + } + catch (Exception &ex) { + result.status = ex.code; + } + } + + if (result.status == HAM_SUCCESS) + set_count(get_count() + 1); + + return (result); + } + + // Returns true if a node requires a split to insert |key| + virtual bool requires_split(Context *context, const ham_key_t *key = 0) { + return (m_impl.requires_split(context, key)); + } + + // Returns true if a node requires a merge or a shift + virtual bool requires_merge() const { + return (m_impl.requires_merge()); + } + + // Splits the node + virtual void split(Context *context, BtreeNodeProxy *other_node, + int pivot) { + ClassType *other = dynamic_cast<ClassType *>(other_node); + ham_assert(other != 0); + + m_impl.split(context, &other->m_impl, pivot); + + size_t node_count = get_count(); + set_count(pivot); + + if (is_leaf()) + other->set_count(node_count - pivot); + else + other->set_count(node_count - pivot - 1); + } + + // Merges all keys from the |other| node into this node + virtual void merge_from(Context *context, BtreeNodeProxy *other_node) { + ClassType *other = dynamic_cast<ClassType *>(other_node); + ham_assert(other != 0); + + m_impl.merge_from(context, &other->m_impl); + + set_count(get_count() + other->get_count()); + other->set_count(0); + } + + // Fills the btree_metrics structure + virtual void fill_metrics(btree_metrics_t *metrics) { + m_impl.fill_metrics(metrics, get_count()); + } + + // Prints the node to stdout (for debugging) + virtual void print(Context *context, size_t node_count = 0) { + std::cout << "page " << m_page->get_address() << ": " << get_count() + << " elements (leaf: " << (is_leaf() ? 
1 : 0) << ", left: " + << get_left() << ", right: " << get_right() << ", ptr_down: " + << get_ptr_down() << ")" << std::endl; + if (!node_count) + node_count = get_count(); + for (size_t i = 0; i < node_count; i++) + m_impl.print(context, i); + } + + // Returns the class name. Only for testing! Uses the functions exported + // by abi.h, which are only available on assorted platforms. Other + // platforms will return empty strings. + virtual std::string test_get_classname() const { + return (get_classname(*this)); + } + + private: + NodeImpl m_impl; +}; + +} // namespace hamsterdb + +#endif /* HAM_BTREE_NODE_PROXY_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_records_base.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_records_base.h new file mode 100644 index 0000000000..6128c8834d --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_records_base.h @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Base class for RecordLists + * + * @exception_safe: nothrow + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_RECORDS_BASE_H +#define HAM_BTREE_RECORDS_BASE_H + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! 
+ +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +struct BaseRecordList +{ + BaseRecordList() + : m_range_size(0) { + } + + // Checks the integrity of this node. Throws an exception if there is a + // violation. + void check_integrity(Context *context, size_t node_count) const { + } + + // Rearranges the list + void vacuumize(size_t node_count, bool force) const { + } + + // Fills the btree_metrics structure + void fill_metrics(btree_metrics_t *metrics, size_t node_count) { + BtreeStatistics::update_min_max_avg(&metrics->recordlist_ranges, + m_range_size); + } + + // The size of the range (in bytes) + size_t m_range_size; +}; + +} // namespace hamsterdb + +#endif /* HAM_BTREE_RECORDS_BASE_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_records_default.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_records_default.h new file mode 100644 index 0000000000..6fcb6f1cb7 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_records_default.h @@ -0,0 +1,424 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * The DefaultRecordList provides simplified access to a list of records, + * where each record is either a 8-byte record identifier (specifying the + * address of a blob) or is stored inline, if the record's size is <= 8 bytes. + * + * Stores 1 byte of flags per record (see btree_flags.h). 
+ * + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_RECORDS_DEFAULT_H +#define HAM_BTREE_RECORDS_DEFAULT_H + +#include "0root/root.h" + +#include <sstream> +#include <iostream> + +// Always verify that a file of level N does not include headers > N! +#include "1globals/globals.h" +#include "1base/dynamic_array.h" +#include "2page/page.h" +#include "3blob_manager/blob_manager.h" +#include "3btree/btree_node.h" +#include "3btree/btree_records_base.h" +#include "4env/env_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +// +// The template classes in this file are wrapped in a separate namespace +// to avoid naming clashes with btree_impl_default.h +// +namespace PaxLayout { + +class DefaultRecordList : public BaseRecordList +{ + public: + enum { + // A flag whether this RecordList has sequential data + kHasSequentialData = 1 + }; + + // Constructor + DefaultRecordList(LocalDatabase *db, PBtreeNode *node) + : m_db(db), m_flags(0), m_data(0) { + } + + // Sets the data pointer; required for initialization + void create(uint8_t *data, size_t range_size) { + size_t capacity = range_size / get_full_record_size(); + m_range_size = range_size; + + if (m_db->config().record_size == HAM_RECORD_SIZE_UNLIMITED) { + m_flags = data; + m_data = (uint64_t *)&data[capacity]; + } + else { + m_flags = 0; + m_data = (uint64_t *)data; + } + } + + // Opens an existing RecordList + void open(uint8_t *data, size_t range_size, size_t node_count) { + size_t capacity = range_size / get_full_record_size(); + m_range_size = range_size; + + if (m_db->config().record_size == HAM_RECORD_SIZE_UNLIMITED) { + m_flags = data; + m_data = (uint64_t *)&data[capacity]; + } + else { + m_flags = 0; + m_data = (uint64_t *)data; + } + } + + // Calculates the required size for a range + size_t get_required_range_size(size_t node_count) { + return (node_count * get_full_record_size()); + } + + // Returns the actual record size 
including overhead + size_t get_full_record_size() const { + return (sizeof(uint64_t) + + (m_db->config().record_size == HAM_RECORD_SIZE_UNLIMITED + ? 1 + : 0)); + } + + // Returns the record counter of a key + int get_record_count(Context *context, int slot) const { + if (unlikely(!is_record_inline(slot) && get_record_id(slot) == 0)) + return (0); + return (1); + } + + // Returns the record size + uint64_t get_record_size(Context *context, int slot, + int duplicate_index = 0) const { + if (is_record_inline(slot)) + return (get_inline_record_size(slot)); + + LocalEnvironment *env = m_db->lenv(); + return (env->blob_manager()->get_blob_size(context, get_record_id(slot))); + } + + // Returns the full record and stores it in |dest|; memory must be + // allocated by the caller + void get_record(Context *context, int slot, ByteArray *arena, + ham_record_t *record, uint32_t flags, + int duplicate_index) const { + bool direct_access = (flags & HAM_DIRECT_ACCESS) != 0; + + // the record is stored inline + if (is_record_inline(slot)) { + record->size = get_inline_record_size(slot); + if (record->size == 0) { + record->data = 0; + return; + } + if (flags & HAM_PARTIAL) { + ham_trace(("flag HAM_PARTIAL is not allowed if record is " + "stored inline")); + throw Exception(HAM_INV_PARAMETER); + } + if (direct_access) + record->data = (void *)&m_data[slot]; + else { + if ((record->flags & HAM_RECORD_USER_ALLOC) == 0) { + arena->resize(record->size); + record->data = arena->get_ptr(); + } + memcpy(record->data, &m_data[slot], record->size); + } + return; + } + + // the record is stored as a blob + LocalEnvironment *env = m_db->lenv(); + env->blob_manager()->read(context, get_record_id(slot), record, + flags, arena); + } + + // Updates the record of a key + void set_record(Context *context, int slot, int duplicate_index, + ham_record_t *record, uint32_t flags, + uint32_t *new_duplicate_index = 0) { + uint64_t ptr = get_record_id(slot); + LocalEnvironment *env = m_db->lenv(); + + // 
key does not yet exist + if (!ptr && !is_record_inline(slot)) { + // a new inline key is inserted + if (record->size <= sizeof(uint64_t)) { + set_record_data(slot, record->data, record->size); + } + // a new (non-inline) key is inserted + else { + ptr = env->blob_manager()->allocate(context, record, flags); + set_record_id(slot, ptr); + } + return; + } + + // an inline key exists + if (is_record_inline(slot)) { + // disable small/tiny/empty flags + set_record_flags(slot, get_record_flags(slot) + & ~(BtreeRecord::kBlobSizeSmall + | BtreeRecord::kBlobSizeTiny + | BtreeRecord::kBlobSizeEmpty)); + // ... and is overwritten with another inline key + if (record->size <= sizeof(uint64_t)) { + set_record_data(slot, record->data, record->size); + } + // ... or with a (non-inline) key + else { + ptr = env->blob_manager()->allocate(context, record, flags); + set_record_id(slot, ptr); + } + return; + } + + // a (non-inline) key exists + if (ptr) { + // ... and is overwritten by a inline key + if (record->size <= sizeof(uint64_t)) { + env->blob_manager()->erase(context, ptr); + set_record_data(slot, record->data, record->size); + } + // ... 
and is overwritten by a (non-inline) key + else { + ptr = env->blob_manager()->overwrite(context, ptr, record, flags); + set_record_id(slot, ptr); + } + return; + } + + ham_assert(!"shouldn't be here"); + throw Exception(HAM_INTERNAL_ERROR); + } + + // Erases the record + void erase_record(Context *context, int slot, int duplicate_index = 0, + bool all_duplicates = true) { + if (is_record_inline(slot)) { + remove_inline_record(slot); + return; + } + + // now erase the blob + m_db->lenv()->blob_manager()->erase(context, get_record_id(slot), 0); + set_record_id(slot, 0); + } + + // Erases a whole slot by shifting all larger records to the "left" + void erase(Context *context, size_t node_count, int slot) { + if (slot < (int)node_count - 1) { + if (m_flags) + memmove(&m_flags[slot], &m_flags[slot + 1], node_count - slot - 1); + memmove(&m_data[slot], &m_data[slot + 1], + sizeof(uint64_t) * (node_count - slot - 1)); + } + } + + // Creates space for one additional record + void insert(Context *context, size_t node_count, int slot) { + if (slot < (int)node_count) { + if (m_flags) + memmove(&m_flags[slot + 1], &m_flags[slot], node_count - slot); + memmove(&m_data[slot + 1], &m_data[slot], + sizeof(uint64_t) * (node_count - slot)); + } + if (m_flags) + m_flags[slot] = 0; + m_data[slot] = 0; + } + + // Copies |count| records from this[sstart] to dest[dstart] + void copy_to(int sstart, size_t node_count, DefaultRecordList &dest, + size_t other_count, int dstart) { + if (m_flags) + memcpy(&dest.m_flags[dstart], &m_flags[sstart], (node_count - sstart)); + memcpy(&dest.m_data[dstart], &m_data[sstart], + sizeof(uint64_t) * (node_count - sstart)); + } + + // Sets the record id + void set_record_id(int slot, uint64_t ptr) { + m_data[slot] = ptr; + } + + // Returns the record id + uint64_t get_record_id(int slot, int duplicate_index = 0) const { + return (m_data[slot]); + } + + // Returns true if there's not enough space for another record + bool requires_split(size_t node_count) 
const { + return ((node_count + 1) * get_full_record_size() >= m_range_size); + } + + // Change the capacity; for PAX layouts this just means copying the + // data from one place to the other + void change_range_size(size_t node_count, uint8_t *new_data_ptr, + size_t new_range_size, size_t capacity_hint) { + size_t new_capacity = capacity_hint + ? capacity_hint + : new_range_size / get_full_record_size(); + // shift "to the right"? then first shift key data, otherwise + // the flags might overwrite the data + if (m_flags == 0) { + memmove(new_data_ptr, m_data, node_count * sizeof(uint64_t)); + } + else { + if (new_data_ptr > m_flags) { + memmove(&new_data_ptr[new_capacity], m_data, + node_count * sizeof(uint64_t)); + memmove(new_data_ptr, m_flags, node_count); + } + else { + memmove(new_data_ptr, m_flags, node_count); + memmove(&new_data_ptr[new_capacity], m_data, + node_count * sizeof(uint64_t)); + } + } + + if (m_db->config().record_size == HAM_RECORD_SIZE_UNLIMITED) { + m_flags = new_data_ptr; + m_data = (uint64_t *)&new_data_ptr[new_capacity]; + } + else { + m_flags = 0; + m_data = (uint64_t *)new_data_ptr; + } + m_range_size = new_range_size; + } + + // Fills the btree_metrics structure + void fill_metrics(btree_metrics_t *metrics, size_t node_count) { + BaseRecordList::fill_metrics(metrics, node_count); + BtreeStatistics::update_min_max_avg(&metrics->recordlist_unused, + m_range_size - get_required_range_size(node_count)); + } + + // Prints a slot to |out| (for debugging) + void print(Context *context, int slot, std::stringstream &out) const { + out << "(" << get_record_size(context, slot) << " bytes)"; + } + + private: + // Sets record data + void set_record_data(int slot, const void *ptr, size_t size) { + uint8_t flags = get_record_flags(slot); + flags &= ~(BtreeRecord::kBlobSizeSmall + | BtreeRecord::kBlobSizeTiny + | BtreeRecord::kBlobSizeEmpty); + + if (size == 0) { + m_data[slot] = 0; + set_record_flags(slot, flags | BtreeRecord::kBlobSizeEmpty); + } + 
else if (size < 8) { + /* the highest byte of the record id is the size of the blob */ + char *p = (char *)&m_data[slot]; + p[sizeof(uint64_t) - 1] = size; + memcpy(&m_data[slot], ptr, size); + set_record_flags(slot, flags | BtreeRecord::kBlobSizeTiny); + } + else if (size == 8) { + memcpy(&m_data[slot], ptr, size); + set_record_flags(slot, flags | BtreeRecord::kBlobSizeSmall); + } + else { + ham_assert(!"shouldn't be here"); + set_record_flags(slot, flags); + } + } + + // Returns the record flags of a given |slot| + uint8_t get_record_flags(int slot, int duplicate_index = 0) + const { + return (m_flags ? m_flags[slot] : 0); + } + + // Sets the record flags of a given |slot| + void set_record_flags(int slot, uint8_t flags) { + ham_assert(m_flags != 0); + m_flags[slot] = flags; + } + + // Returns the size of an inline record + uint32_t get_inline_record_size(int slot) const { + uint8_t flags = get_record_flags(slot); + ham_assert(is_record_inline(slot)); + if (flags & BtreeRecord::kBlobSizeTiny) { + /* the highest byte of the record id is the size of the blob */ + char *p = (char *)&m_data[slot]; + return (p[sizeof(uint64_t) - 1]); + } + if (flags & BtreeRecord::kBlobSizeSmall) + return (sizeof(uint64_t)); + if (flags & BtreeRecord::kBlobSizeEmpty) + return (0); + ham_assert(!"shouldn't be here"); + return (0); + } + + // Returns true if the record is inline, false if the record is a blob + bool is_record_inline(int slot) const { + uint8_t flags = get_record_flags(slot); + return ((flags & BtreeRecord::kBlobSizeTiny) + || (flags & BtreeRecord::kBlobSizeSmall) + || (flags & BtreeRecord::kBlobSizeEmpty) != 0); + } + + // Removes an inline record; returns the updated record flags + void remove_inline_record(int slot) { + uint8_t flags = get_record_flags(slot); + m_data[slot] = 0; + set_record_flags(slot, + flags & ~(BtreeRecord::kBlobSizeSmall + | BtreeRecord::kBlobSizeTiny + | BtreeRecord::kBlobSizeEmpty)); + } + + // The parent database of this btree + LocalDatabase 
*m_db; + + // The record flags + uint8_t *m_flags; + + // The actual record data - an array of 64bit record IDs + uint64_t *m_data; +}; + +} // namespace PaxLayout + +} // namespace hamsterdb + +#endif /* HAM_BTREE_RECORDS_DEFAULT_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_records_duplicate.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_records_duplicate.h new file mode 100644 index 0000000000..861f7a7640 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_records_duplicate.h @@ -0,0 +1,1557 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * RecordList implementations for duplicate records + * + * Duplicate records are stored inline till a certain threshold limit + * (m_duptable_threshold) is reached. In this case the duplicates are stored + * in a separate blob (the DuplicateTable), and the previously occupied storage + * in the node is reused for other records. + * + * Since records therefore have variable length, an UpfrontIndex is used + * (see btree_keys_varlen.h). + * + * This file has two RecordList implementations: + * + * - DuplicateRecordList: stores regular records as duplicates; records + * are stored as blobs if their size exceeds 8 bytes. Otherwise + * they are stored inline. 
+ * + * - DuplicateInlineRecordList: stores small fixed length records as + * duplicates + * + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_RECORDS_DUPLICATE_H +#define HAM_BTREE_RECORDS_DUPLICATE_H + +#include "0root/root.h" + +#include <algorithm> +#include <iostream> +#include <vector> +#include <map> + +// Always verify that a file of level N does not include headers > N! +#include "1globals/globals.h" +#include "1base/scoped_ptr.h" +#include "1base/dynamic_array.h" +#include "2page/page.h" +#include "3blob_manager/blob_manager.h" +#include "3btree/btree_node.h" +#include "3btree/btree_index.h" +#include "3btree/upfront_index.h" +#include "3btree/btree_records_base.h" +#include "4env/env_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +namespace DefLayout { + +// helper function which returns true if a record is inline +static bool is_record_inline(uint8_t flags) { + return (flags != 0); +} + +// +// A helper class for dealing with extended duplicate tables +// +// Byte [0..3] - count +// [4..7] - capacity +// [8.. [ - the record list +// if m_inline_records: +// each record has n bytes record-data +// else +// each record has 1 byte flags, n bytes record-data +// +class DuplicateTable +{ + public: + // Constructor; the flag |inline_records| indicates whether record + // flags should be stored for each record. |record_size| is the + // fixed length size of each record, or HAM_RECORD_SIZE_UNLIMITED + DuplicateTable(LocalDatabase *db, bool inline_records, size_t record_size) + : m_db(db), m_store_flags(!inline_records), m_record_size(record_size), + m_inline_records(inline_records), m_table_id(0) { + } + + // Allocates and fills the table; returns the new table id. + // Can allocate empty tables (required for testing purposes). + // The initial capacity of the table is twice the current + // |record_count|. 
+ uint64_t create(Context *context, const uint8_t *data, + size_t record_count) { + ham_assert(m_table_id == 0); + + // This sets the initial capacity as described above + size_t capacity = record_count * 2; + m_table.resize(8 + capacity * get_record_width()); + if (likely(record_count > 0)) + m_table.overwrite(8, data, (m_inline_records + ? m_record_size * record_count + : 9 * record_count)); + + set_record_count(record_count); + set_record_capacity(record_count * 2); + + // Flush the table to disk, returns the blob-id of the table + return (flush_duplicate_table(context)); + } + + // Reads the table from disk + void open(Context *context, uint64_t table_id) { + ham_record_t record = {0}; + m_db->lenv()->blob_manager()->read(context, table_id, + &record, HAM_FORCE_DEEP_COPY, &m_table); + m_table_id = table_id; + } + + // Returns the number of duplicates in that table + int get_record_count() const { + ham_assert(m_table.get_size() > 4); + return ((int) *(uint32_t *)m_table.get_ptr()); + } + + // Returns the record size of a duplicate + uint64_t get_record_size(Context *context, int duplicate_index) { + ham_assert(duplicate_index < get_record_count()); + if (m_inline_records) + return (m_record_size); + ham_assert(m_store_flags == true); + + uint8_t *precord_flags; + uint8_t *p = get_record_data(duplicate_index, &precord_flags); + uint8_t flags = *precord_flags; + + if (flags & BtreeRecord::kBlobSizeTiny) + return (p[sizeof(uint64_t) - 1]); + if (flags & BtreeRecord::kBlobSizeSmall) + return (sizeof(uint64_t)); + if (flags & BtreeRecord::kBlobSizeEmpty) + return (0); + + uint64_t blob_id = *(uint64_t *)p; + return (m_db->lenv()->blob_manager()->get_blob_size(context, blob_id)); + } + + // Returns the full record and stores it in |record|. |flags| can + // be 0 or |HAM_DIRECT_ACCESS|, |HAM_PARTIAL|. These are the default + // flags of ham_db_find et al. 
+ void get_record(Context *context, ByteArray *arena, ham_record_t *record, + uint32_t flags, int duplicate_index) { + ham_assert(duplicate_index < get_record_count()); + bool direct_access = (flags & HAM_DIRECT_ACCESS) != 0; + + uint8_t *precord_flags; + uint8_t *p = get_record_data(duplicate_index, &precord_flags); + uint8_t record_flags = precord_flags ? *precord_flags : 0; + + if (m_inline_records) { + if (flags & HAM_PARTIAL) { + ham_trace(("flag HAM_PARTIAL is not allowed if record is " + "stored inline")); + throw Exception(HAM_INV_PARAMETER); + } + + record->size = m_record_size; + if (direct_access) + record->data = p; + else { + if ((record->flags & HAM_RECORD_USER_ALLOC) == 0) { + arena->resize(record->size); + record->data = arena->get_ptr(); + } + memcpy(record->data, p, m_record_size); + } + return; + } + + ham_assert(m_store_flags == true); + + if (record_flags & BtreeRecord::kBlobSizeEmpty) { + record->data = 0; + record->size = 0; + return; + } + + if (record_flags & BtreeRecord::kBlobSizeTiny) { + record->size = p[sizeof(uint64_t) - 1]; + if (direct_access) + record->data = &p[0]; + else { + if ((record->flags & HAM_RECORD_USER_ALLOC) == 0) { + arena->resize(record->size); + record->data = arena->get_ptr(); + } + memcpy(record->data, &p[0], record->size); + } + return; + } + + if (record_flags & BtreeRecord::kBlobSizeSmall) { + record->size = sizeof(uint64_t); + if (direct_access) + record->data = &p[0]; + else { + if ((record->flags & HAM_RECORD_USER_ALLOC) == 0) { + arena->resize(record->size); + record->data = arena->get_ptr(); + } + memcpy(record->data, &p[0], record->size); + } + return; + } + + uint64_t blob_id = *(uint64_t *)p; + + // the record is stored as a blob + LocalEnvironment *env = m_db->lenv(); + env->blob_manager()->read(context, blob_id, record, flags, arena); + } + + // Updates the record of a key. Analog to the set_record() method + // of the NodeLayout class. 
Returns the new table id and the + // new duplicate index, if |new_duplicate_index| is not null. + uint64_t set_record(Context *context, int duplicate_index, + ham_record_t *record, uint32_t flags, + uint32_t *new_duplicate_index) { + BlobManager *blob_manager = m_db->lenv()->blob_manager(); + + // the duplicate is overwritten + if (flags & HAM_OVERWRITE) { + uint8_t *record_flags = 0; + uint8_t *p = get_record_data(duplicate_index, &record_flags); + + // the record is stored inline w/ fixed length? + if (m_inline_records) { + ham_assert(record->size == m_record_size); + memcpy(p, record->data, record->size); + return (flush_duplicate_table(context)); + } + // the existing record is a blob + if (!is_record_inline(*record_flags)) { + uint64_t ptr = *(uint64_t *)p; + // overwrite the blob record + if (record->size > sizeof(uint64_t)) { + *(uint64_t *)p = blob_manager->overwrite(context, ptr, + record, flags); + return (flush_duplicate_table(context)); + } + // otherwise delete it and continue + blob_manager->erase(context, ptr, 0); + } + } + + // If the key is not overwritten but inserted or appended: create a + // "gap" in the table + else { + int record_count = get_record_count(); + + // check for overflow + if (unlikely(record_count == std::numeric_limits<int>::max())) { + ham_log(("Duplicate table overflow")); + throw Exception(HAM_LIMITS_REACHED); + } + + // adjust flags + if (flags & HAM_DUPLICATE_INSERT_BEFORE && duplicate_index == 0) + flags |= HAM_DUPLICATE_INSERT_FIRST; + else if (flags & HAM_DUPLICATE_INSERT_AFTER) { + if (duplicate_index == record_count) + flags |= HAM_DUPLICATE_INSERT_LAST; + else { + flags |= HAM_DUPLICATE_INSERT_BEFORE; + duplicate_index++; + } + } + + // resize the table, if necessary + if (unlikely(record_count == get_record_capacity())) + grow_duplicate_table(); + + // handle overwrites or inserts/appends + if (flags & HAM_DUPLICATE_INSERT_FIRST) { + if (record_count) { + uint8_t *ptr = get_raw_record_data(0); + memmove(ptr + 
get_record_width(), ptr, + record_count * get_record_width()); + } + duplicate_index = 0; + } + else if (flags & HAM_DUPLICATE_INSERT_BEFORE) { + uint8_t *ptr = get_raw_record_data(duplicate_index); + memmove(ptr + get_record_width(), ptr, + (record_count - duplicate_index) * get_record_width()); + } + else // HAM_DUPLICATE_INSERT_LAST + duplicate_index = record_count; + + set_record_count(record_count + 1); + } + + uint8_t *record_flags = 0; + uint8_t *p = get_record_data(duplicate_index, &record_flags); + + // store record inline? + if (m_inline_records) { + ham_assert(m_record_size == record->size); + if (m_record_size > 0) + memcpy(p, record->data, record->size); + } + else if (record->size == 0) { + memcpy(p, "\0\0\0\0\0\0\0\0", 8); + *record_flags = BtreeRecord::kBlobSizeEmpty; + } + else if (record->size < sizeof(uint64_t)) { + p[sizeof(uint64_t) - 1] = (uint8_t)record->size; + memcpy(&p[0], record->data, record->size); + *record_flags = BtreeRecord::kBlobSizeTiny; + } + else if (record->size == sizeof(uint64_t)) { + memcpy(&p[0], record->data, record->size); + *record_flags = BtreeRecord::kBlobSizeSmall; + } + else { + *record_flags = 0; + uint64_t blob_id = blob_manager->allocate(context, record, flags); + memcpy(p, &blob_id, sizeof(blob_id)); + } + + if (new_duplicate_index) + *new_duplicate_index = duplicate_index; + + // write the duplicate table to disk and return the table-id + return (flush_duplicate_table(context)); + } + + // Deletes a record from the table; also adjusts the count. If + // |all_duplicates| is true or if the last element of the table is + // deleted then the table itself will also be deleted. Returns 0 + // if this is the case, otherwise returns the table id. 
+ uint64_t erase_record(Context *context, int duplicate_index, + bool all_duplicates) { + int record_count = get_record_count(); + + if (record_count == 1 && duplicate_index == 0) + all_duplicates = true; + + if (all_duplicates) { + if (m_store_flags && !m_inline_records) { + for (int i = 0; i < record_count; i++) { + uint8_t *record_flags; + uint8_t *p = get_record_data(i, &record_flags); + if (is_record_inline(*record_flags)) + continue; + if (*(uint64_t *)p != 0) { + m_db->lenv()->blob_manager()->erase(context, *(uint64_t *)p); + *(uint64_t *)p = 0; + } + } + } + if (m_table_id != 0) + m_db->lenv()->blob_manager()->erase(context, m_table_id); + set_record_count(0); + m_table_id = 0; + return (0); + } + + ham_assert(record_count > 0 && duplicate_index < record_count); + + uint8_t *record_flags; + uint8_t *lhs = get_record_data(duplicate_index, &record_flags); + if (record_flags != 0 && *record_flags == 0 && !m_inline_records) { + m_db->lenv()->blob_manager()->erase(context, *(uint64_t *)lhs); + *(uint64_t *)lhs = 0; + } + + if (duplicate_index < record_count - 1) { + lhs = get_raw_record_data(duplicate_index); + uint8_t *rhs = lhs + get_record_width(); + memmove(lhs, rhs, get_record_width() + * (record_count - duplicate_index - 1)); + } + + // adjust the counter + set_record_count(record_count - 1); + + // write the duplicate table to disk and return the table-id + return (flush_duplicate_table(context)); + } + + // Returns the maximum capacity of elements in a duplicate table + // This method could be private, but it's required by the unittests + int get_record_capacity() const { + ham_assert(m_table.get_size() >= 8); + return ((int) *(uint32_t *)((uint8_t *)m_table.get_ptr() + 4)); + } + + private: + // Doubles the capacity of the ByteArray which backs the table + void grow_duplicate_table() { + int capacity = get_record_capacity(); + if (capacity == 0) + capacity = 8; + m_table.resize(8 + (capacity * 2) * get_record_width()); + set_record_capacity(capacity * 
2); + } + + // Writes the modified duplicate table to disk; returns the new + // table-id + uint64_t flush_duplicate_table(Context *context) { + ham_record_t record = {0}; + record.data = m_table.get_ptr(); + record.size = m_table.get_size(); + if (!m_table_id) + m_table_id = m_db->lenv()->blob_manager()->allocate( + context, &record, 0); + else + m_table_id = m_db->lenv()->blob_manager()->overwrite( + context, m_table_id, &record, 0); + return (m_table_id); + } + + // Returns the size of a record structure in the ByteArray + size_t get_record_width() const { + if (m_inline_records) + return (m_record_size); + ham_assert(m_store_flags == true); + return (sizeof(uint64_t) + 1); + } + + // Returns a pointer to the record data (including flags) + uint8_t *get_raw_record_data(int duplicate_index) { + if (m_inline_records) + return ((uint8_t *)m_table.get_ptr() + + 8 + + m_record_size * duplicate_index); + else + return ((uint8_t *)m_table.get_ptr() + + 8 + + 9 * duplicate_index); + } + + // Returns a pointer to the record data, and the flags + uint8_t *get_record_data(int duplicate_index, + uint8_t **pflags = 0) { + uint8_t *p = get_raw_record_data(duplicate_index); + if (m_store_flags) { + if (pflags) + *pflags = p++; + else + p++; + } + else if (pflags) + *pflags = 0; + return (p); + } + + // Sets the number of used elements in a duplicate table + void set_record_count(int record_count) { + *(uint32_t *)m_table.get_ptr() = (uint32_t)record_count; + } + + // Sets the maximum capacity of elements in a duplicate table + void set_record_capacity(int capacity) { + ham_assert(m_table.get_size() >= 8); + *(uint32_t *)((uint8_t *)m_table.get_ptr() + 4) = (uint32_t)capacity; + } + + // The database + LocalDatabase *m_db; + + // Whether to store flags per record or not (true unless records + // have constant length) + bool m_store_flags; + + // The constant length record size, or HAM_RECORD_SIZE_UNLIMITED + size_t m_record_size; + + // Stores the actual data of the table + 
ByteArray m_table; + + // True if records are inline + bool m_inline_records; + + // The blob id for persisting the table + uint64_t m_table_id; +}; + +// +// Common functions for duplicate record lists +// +class DuplicateRecordList : public BaseRecordList +{ + protected: + // for caching external duplicate tables + typedef std::map<uint64_t, DuplicateTable *> DuplicateTableCache; + + public: + enum { + // A flag whether this RecordList has sequential data + kHasSequentialData = 0 + }; + + // Constructor + DuplicateRecordList(LocalDatabase *db, PBtreeNode *node, + bool store_flags, size_t record_size) + : m_db(db), m_node(node), m_index(db), m_data(0), + m_store_flags(store_flags), m_record_size(record_size) { + size_t page_size = db->lenv()->config().page_size_bytes; + if (Globals::ms_duplicate_threshold) + m_duptable_threshold = Globals::ms_duplicate_threshold; + else { + if (page_size == 1024) + m_duptable_threshold = 8; + else if (page_size <= 1024 * 8) + m_duptable_threshold = 12; + else if (page_size <= 1024 * 16) + m_duptable_threshold = 20; + else if (page_size <= 1024 * 32) + m_duptable_threshold = 32; + else { + // 0x7f/127 is the maximum that we can store in the record + // counter (7 bits), but we won't exploit this fully + m_duptable_threshold = 64; + } + } + + // UpfrontIndex's chunk_size is just 1 byte (max 255); make sure that + // the duplicate list fits into a single chunk! 
+ size_t rec_size = m_record_size; + if (rec_size == HAM_RECORD_SIZE_UNLIMITED) + rec_size = 9; + if (m_duptable_threshold * rec_size > 250) + m_duptable_threshold = 250 / rec_size; + } + + // Destructor - clears the cache + ~DuplicateRecordList() { + if (m_duptable_cache) { + for (DuplicateTableCache::iterator it = m_duptable_cache->begin(); + it != m_duptable_cache->end(); it++) + delete it->second; + } + } + + // Opens an existing RecordList + void open(uint8_t *ptr, size_t range_size, size_t node_count) { + m_data = ptr; + m_index.open(m_data, range_size); + m_range_size = range_size; + } + + // Returns a duplicate table; uses a cache to speed up access + DuplicateTable *get_duplicate_table(Context *context, uint64_t table_id) { + if (!m_duptable_cache) + m_duptable_cache.reset(new DuplicateTableCache()); + else { + DuplicateTableCache::iterator it = m_duptable_cache->find(table_id); + if (it != m_duptable_cache->end()) + return (it->second); + } + + DuplicateTable *dt = new DuplicateTable(m_db, !m_store_flags, + m_record_size); + dt->open(context, table_id); + (*m_duptable_cache)[table_id] = dt; + return (dt); + } + + // Updates the DupTableCache and changes the table id of a DuplicateTable. + // Called whenever a DuplicateTable's size increases, and the new blob-id + // differs from the old one. + void update_duplicate_table_id(DuplicateTable *dt, + uint64_t old_table_id, uint64_t new_table_id) { + m_duptable_cache->erase(old_table_id); + (*m_duptable_cache)[new_table_id] = dt; + } + + // Erases a slot. Only updates the UpfrontIndex; does NOT delete the + // record blobs! 
+ void erase(Context *context, size_t node_count, int slot) { + m_index.erase(node_count, slot); + } + + // Inserts a slot for one additional record + void insert(Context *context, size_t node_count, int slot) { + m_index.insert(node_count, slot); + } + + // Copies |count| items from this[sstart] to dest[dstart] + void copy_to(int sstart, size_t node_count, + DuplicateRecordList &dest, size_t other_node_count, + int dstart) { + // make sure that the other node has sufficient capacity in its + // UpfrontIndex + dest.m_index.change_range_size(other_node_count, 0, 0, + m_index.get_capacity()); + + uint32_t doffset; + for (size_t i = 0; i < node_count - sstart; i++) { + size_t size = m_index.get_chunk_size(sstart + i); + + dest.m_index.insert(other_node_count + i, dstart + i); + // destination offset + doffset = dest.m_index.allocate_space(other_node_count + i + 1, + dstart + i, size); + doffset = dest.m_index.get_absolute_offset(doffset); + // source offset + uint32_t soffset = m_index.get_chunk_offset(sstart + i); + soffset = m_index.get_absolute_offset(soffset); + // copy the data + memcpy(&dest.m_data[doffset], &m_data[soffset], size); + } + + // After copying, the caller will reduce the node count drastically. + // Therefore invalidate the cached next_offset. 
+ m_index.invalidate_next_offset(); + } + + // Rearranges the list + void vacuumize(size_t node_count, bool force) { + if (force) + m_index.increase_vacuumize_counter(100); + m_index.maybe_vacuumize(node_count); + } + + protected: + // The database + LocalDatabase *m_db; + + // The current node + PBtreeNode *m_node; + + // The index which manages variable length chunks + UpfrontIndex m_index; + + // The actual data of the node + uint8_t *m_data; + + // Whether record flags are required + bool m_store_flags; + + // The constant record size, or HAM_RECORD_SIZE_UNLIMITED + size_t m_record_size; + + // The duplicate threshold + size_t m_duptable_threshold; + + // A cache for duplicate tables + ScopedPtr<DuplicateTableCache> m_duptable_cache; +}; + +// +// RecordList for records with fixed length, with duplicates. It uses +// an UpfrontIndex to manage the variable length chunks. +// +// If a key has duplicates, then all duplicates are stored sequentially. +// If that duplicate list exceeds a certain threshold then they are moved +// to a DuplicateTable, which is stored as a blob. 
+// +// Format for each slot: +// +// 1 byte meta data +// bit 1 - 7: duplicate counter, if kExtendedDuplicates == 0 +// bit 8: kExtendedDuplicates +// if kExtendedDuplicates == 0: +// <counter> * <length> bytes +// <length> byte data (always inline) +// if kExtendedDuplicates == 1: +// 8 byte: record id of the extended duplicate table +// +class DuplicateInlineRecordList : public DuplicateRecordList +{ + public: + // Constructor + DuplicateInlineRecordList(LocalDatabase *db, PBtreeNode *node) + : DuplicateRecordList(db, node, false, db->config().record_size), + m_record_size(db->config().record_size) { + } + + // Creates a new RecordList starting at |data| + void create(uint8_t *data, size_t range_size) { + m_data = data; + m_index.create(m_data, range_size, range_size / get_full_record_size()); + m_range_size = range_size; + } + + // Calculates the required size for a range with the specified |capacity| + size_t get_required_range_size(size_t node_count) const { + return (m_index.get_required_range_size(node_count)); + } + + // Returns the actual record size including overhead + size_t get_full_record_size() const { + return (1 + m_record_size + m_index.get_full_index_size()); + } + + // Returns the number of duplicates for a slot + int get_record_count(Context *context, int slot) { + uint32_t offset = m_index.get_absolute_chunk_offset(slot); + if (m_data[offset] & BtreeRecord::kExtendedDuplicates) { + DuplicateTable *dt = get_duplicate_table(context, get_record_id(slot)); + return ((int)dt->get_record_count()); + } + + return (m_data[offset] & 0x7f); + } + + // Returns the size of a record; the size is always constant + uint64_t get_record_size(Context *context, int slot, + int duplicate_index = 0) const { + return (m_record_size); + } + + // Returns the full record and stores it in |dest| + void get_record(Context *context, int slot, ByteArray *arena, + ham_record_t *record, uint32_t flags, + int duplicate_index) { + // forward to duplicate table? 
+ uint32_t offset = m_index.get_absolute_chunk_offset(slot); + if (unlikely(m_data[offset] & BtreeRecord::kExtendedDuplicates)) { + DuplicateTable *dt = get_duplicate_table(context, get_record_id(slot)); + dt->get_record(context, arena, record, flags, duplicate_index); + return; + } + + if (flags & HAM_PARTIAL) { + ham_trace(("flag HAM_PARTIAL is not allowed if record is " + "stored inline")); + throw Exception(HAM_INV_PARAMETER); + } + + ham_assert(duplicate_index < (int)get_inline_record_count(slot)); + bool direct_access = (flags & HAM_DIRECT_ACCESS) != 0; + + // the record is always stored inline + const uint8_t *ptr = get_record_data(slot, duplicate_index); + record->size = m_record_size; + if (direct_access) + record->data = (void *)ptr; + else { + if ((record->flags & HAM_RECORD_USER_ALLOC) == 0) { + arena->resize(record->size); + record->data = arena->get_ptr(); + } + memcpy(record->data, ptr, m_record_size); + } + } + + // Adds or overwrites a record + void set_record(Context *context, int slot, int duplicate_index, + ham_record_t *record, uint32_t flags, + uint32_t *new_duplicate_index = 0) { + uint32_t chunk_offset = m_index.get_absolute_chunk_offset(slot); + uint32_t current_size = m_index.get_chunk_size(slot); + + ham_assert(m_record_size == record->size); + + // if the slot was not yet allocated: allocate new space, initialize + // it and then overwrite the record + if (current_size == 0) { + duplicate_index = 0; + flags |= HAM_OVERWRITE; + chunk_offset = m_index.allocate_space(m_node->get_count(), slot, + 1 + m_record_size); + chunk_offset = m_index.get_absolute_offset(chunk_offset); + // clear the flags + m_data[chunk_offset] = 0; + + set_inline_record_count(slot, 1); + } + + // if there's no duplicate table, but we're not able to add another + // duplicate because of size constraints, then offload all + // existing duplicates to an external DuplicateTable + uint32_t record_count = get_inline_record_count(slot); + size_t required_size = 1 + 
(record_count + 1) * m_record_size; + + if (!(m_data[chunk_offset] & BtreeRecord::kExtendedDuplicates) + && !(flags & HAM_OVERWRITE)) { + bool force_duptable = record_count >= m_duptable_threshold; + if (!force_duptable + && !m_index.can_allocate_space(m_node->get_count(), + required_size)) + force_duptable = true; + + // update chunk_offset - it might have been modified if + // m_index.can_allocate_space triggered a vacuumize() operation + chunk_offset = m_index.get_absolute_chunk_offset(slot); + + // already too many duplicates, or the record does not fit? then + // allocate an overflow duplicate list and move all duplicates to + // this list + if (force_duptable) { + DuplicateTable *dt = new DuplicateTable(m_db, !m_store_flags, + m_record_size); + uint64_t table_id = dt->create(context, get_record_data(slot, 0), + record_count); + if (!m_duptable_cache) + m_duptable_cache.reset(new DuplicateTableCache()); + (*m_duptable_cache)[table_id] = dt; + + // write the id of the duplicate table + if (m_index.get_chunk_size(slot) < 8 + 1) { + // do not erase the slot because it occupies so little space + size_t node_count = m_node->get_count(); + // force a split in the caller if the duplicate table cannot + // be inserted + if (!m_index.can_allocate_space(node_count, 8 + 1)) + throw Exception(HAM_LIMITS_REACHED); + m_index.allocate_space(node_count, slot, 8 + 1); + chunk_offset = m_index.get_absolute_chunk_offset(slot); + } + + m_data[chunk_offset] |= BtreeRecord::kExtendedDuplicates; + set_record_id(slot, table_id); + set_inline_record_count(slot, 0); + + // the chunk now only holds the meta byte and the 8 byte table id; + // read the previous chunk size BEFORE it is overwritten, otherwise + // the number of released bytes passed to the vacuumize counter + // is always zero + uint32_t previous_chunk_size = m_index.get_chunk_size(slot); + m_index.set_chunk_size(slot, 8 + 1); + if (previous_chunk_size > 8 + 1) + m_index.increase_vacuumize_counter(previous_chunk_size - (8 + 1)); + m_index.invalidate_next_offset(); + + // fall through + } + } + + // forward to duplicate table? 
+ if (unlikely(m_data[chunk_offset] & BtreeRecord::kExtendedDuplicates)) { + uint64_t table_id = get_record_id(slot); + DuplicateTable *dt = get_duplicate_table(context, table_id); + uint64_t new_table_id = dt->set_record(context, duplicate_index, record, + flags, new_duplicate_index); + if (new_table_id != table_id) { + update_duplicate_table_id(dt, table_id, new_table_id); + set_record_id(slot, new_table_id); + } + return; + } + + // the duplicate is overwritten + if (flags & HAM_OVERWRITE) { + // the record is always stored inline w/ fixed length + uint8_t *p = (uint8_t *)get_record_data(slot, duplicate_index); + memcpy(p, record->data, record->size); + return; + } + + // Allocate new space for the duplicate table, if required + if (current_size < required_size) { + uint8_t *oldp = &m_data[chunk_offset]; + uint32_t old_chunk_size = m_index.get_chunk_size(slot); + uint32_t old_chunk_offset = m_index.get_chunk_offset(slot); + uint32_t new_chunk_offset = m_index.allocate_space(m_node->get_count(), + slot, required_size); + chunk_offset = m_index.get_absolute_offset(new_chunk_offset); + if (current_size > 0 && old_chunk_offset != new_chunk_offset) { + memmove(&m_data[chunk_offset], oldp, current_size); + m_index.add_to_freelist(m_node->get_count(), old_chunk_offset, + old_chunk_size); + } + } + + // adjust flags + if (flags & HAM_DUPLICATE_INSERT_BEFORE && duplicate_index == 0) + flags |= HAM_DUPLICATE_INSERT_FIRST; + else if (flags & HAM_DUPLICATE_INSERT_AFTER) { + if (duplicate_index == (int)record_count) + flags |= HAM_DUPLICATE_INSERT_LAST; + else { + flags |= HAM_DUPLICATE_INSERT_BEFORE; + duplicate_index++; + } + } + + // handle overwrites or inserts/appends + if (flags & HAM_DUPLICATE_INSERT_FIRST) { + if (record_count > 0) { + uint8_t *ptr = get_record_data(slot, 0); + memmove(get_record_data(slot, 1), ptr, record_count * m_record_size); + } + duplicate_index = 0; + } + else if (flags & HAM_DUPLICATE_INSERT_BEFORE) { + memmove(get_record_data(slot, 
duplicate_index + 1), + get_record_data(slot, duplicate_index), + (record_count - duplicate_index) * m_record_size); + // the memmove above shifts the records [duplicate_index, record_count) + // one position towards the end of the chunk; this opens the gap at + // |duplicate_index| which receives the new record below + } + else // HAM_DUPLICATE_INSERT_LAST + duplicate_index = record_count; + + set_inline_record_count(slot, record_count + 1); + + // store the new record inline + if (m_record_size > 0) + memcpy(get_record_data(slot, duplicate_index), + record->data, record->size); + + if (new_duplicate_index) + *new_duplicate_index = duplicate_index; + } + + // Erases the duplicate |duplicate_index| of |slot|, or all duplicates + // if |all_duplicates| is true (does not remove the slot!). If the + // duplicates are stored in an external DuplicateTable then the request + // is forwarded to that table. + void erase_record(Context *context, int slot, int duplicate_index = 0, + bool all_duplicates = false) { + uint32_t offset = m_index.get_absolute_chunk_offset(slot); + + // forward to external duplicate table? + if (unlikely(m_data[offset] & BtreeRecord::kExtendedDuplicates)) { + uint64_t table_id = get_record_id(slot); + DuplicateTable *dt = get_duplicate_table(context, table_id); + uint64_t new_table_id = dt->erase_record(context, duplicate_index, + all_duplicates); + if (new_table_id == 0) { + m_duptable_cache->erase(table_id); + set_record_id(slot, 0); + m_data[offset] &= ~BtreeRecord::kExtendedDuplicates; + delete dt; + } + else if (new_table_id != table_id) { + update_duplicate_table_id(dt, table_id, new_table_id); + set_record_id(slot, new_table_id); + } + return; + } + + // there's only one record left which is erased? + size_t node_count = get_inline_record_count(slot); + if (node_count == 1 && duplicate_index == 0) + all_duplicates = true; + + // erase all duplicates? 
+ if (all_duplicates) { + set_inline_record_count(slot, 0); + } + else { + // close the gap: move the records behind |duplicate_index| one + // position towards the front of this slot's chunk. Both pointers + // must address |slot|; get_record_data() takes the slot as its + // first parameter and the duplicate index as its second. + if (duplicate_index < (int)node_count - 1) + memmove(get_record_data(slot, duplicate_index), + get_record_data(slot, duplicate_index + 1), + m_record_size * (node_count - duplicate_index - 1)); + set_inline_record_count(slot, node_count - 1); + } + } + + // Returns a 64bit record id from a record + uint64_t get_record_id(int slot, + int duplicate_index = 0) const { + return (*(uint64_t *)get_record_data(slot, duplicate_index)); + } + + // Sets a 64bit record id; used for internal nodes to store Page IDs + // or for leaf nodes to store DuplicateTable IDs + void set_record_id(int slot, uint64_t id) { + ham_assert(m_index.get_chunk_size(slot) >= sizeof(id)); + *(uint64_t *)get_record_data(slot, 0) = id; + } + + // Checks the integrity of this node. Throws an exception if there is a + // violation. + void check_integrity(Context *context, size_t node_count, + bool quick = false) const { + for (size_t i = 0; i < node_count; i++) { + uint32_t offset = m_index.get_absolute_chunk_offset(i); + if (m_data[offset] & BtreeRecord::kExtendedDuplicates) { + ham_assert((m_data[offset] & 0x7f) == 0); + } + } + + m_index.check_integrity(node_count); + } + + // Change the capacity; the capacity will be reduced, growing is not + // implemented. Which means that the data area must be copied; the offsets + // do not have to be changed. + void change_range_size(size_t node_count, uint8_t *new_data_ptr, + size_t new_range_size, size_t capacity_hint) { + // no capacity given? 
then try to find a good default one + if (capacity_hint == 0) { + capacity_hint = (new_range_size - m_index.get_next_offset(node_count) + - get_full_record_size()) / m_index.get_full_index_size(); + if (capacity_hint <= node_count) + capacity_hint = node_count + 1; + } + + // if there's not enough space for the new capacity then try to reduce + // the capacity + if (m_index.get_next_offset(node_count) + get_full_record_size() + + capacity_hint * m_index.get_full_index_size() + + UpfrontIndex::kPayloadOffset + > new_range_size) + capacity_hint = node_count + 1; + + m_index.change_range_size(node_count, new_data_ptr, new_range_size, + capacity_hint); + m_data = new_data_ptr; + m_range_size = new_range_size; + } + + // Returns true if there's not enough space for another record + bool requires_split(size_t node_count) { + // if the record is extremely small then make sure there's some headroom; + // this is required for DuplicateTable ids which are 64bit numbers + size_t required = get_full_record_size(); + if (required < 10) + required = 10; + return (m_index.requires_split(node_count, required)); + } + + // Fills the btree_metrics structure + void fill_metrics(btree_metrics_t *metrics, size_t node_count) { + BaseRecordList::fill_metrics(metrics, node_count); + BtreeStatistics::update_min_max_avg(&metrics->recordlist_index, + m_index.get_capacity() * m_index.get_full_index_size()); + BtreeStatistics::update_min_max_avg(&metrics->recordlist_unused, + m_range_size - get_required_range_size(node_count)); + } + + // Prints a slot to |out| (for debugging) + void print(Context *context, int slot, std::stringstream &out) { + out << "(" << get_record_count(context, slot) << " records)"; + } + + private: + // Returns the number of records that are stored inline + uint32_t get_inline_record_count(int slot) { + uint32_t offset = m_index.get_absolute_chunk_offset(slot); + return (m_data[offset] & 0x7f); + } + + // Sets the number of records that are stored inline + void 
set_inline_record_count(int slot, size_t count) { + ham_assert(count <= 0x7f); + uint32_t offset = m_index.get_absolute_chunk_offset(slot); + m_data[offset] &= BtreeRecord::kExtendedDuplicates; + m_data[offset] |= count; + } + + // Returns a pointer to the record data + uint8_t *get_record_data(int slot, int duplicate_index = 0) { + uint32_t offset = m_index.get_absolute_chunk_offset(slot); + return (&m_data[offset + 1 + m_record_size * duplicate_index]); + } + + // Returns a pointer to the record data (const flavour) + const uint8_t *get_record_data(int slot, + int duplicate_index = 0) const { + uint32_t offset = m_index.get_absolute_chunk_offset(slot); + return (&m_data[offset + 1 + m_record_size * duplicate_index]); + } + + // The constant length record size + size_t m_record_size; +}; + +// +// RecordList for default records (8 bytes; either inline or a record id), +// with duplicates +// +// Format for each slot: +// +// 1 byte meta data +// bit 1 - 7: duplicate counter, if kExtendedDuplicates == 0 +// bit 8: kExtendedDuplicates +// if kExtendedDuplicates == 0: +// <counter> * 9 bytes +// 1 byte flags (RecordFlag::*) +// 8 byte data (either inline or record-id) +// if kExtendedDuplicates == 1: +// 8 byte: record id of the extended duplicate table +// +class DuplicateDefaultRecordList : public DuplicateRecordList +{ + public: + // Constructor + DuplicateDefaultRecordList(LocalDatabase *db, PBtreeNode *node) + : DuplicateRecordList(db, node, true, HAM_RECORD_SIZE_UNLIMITED) { + } + + // Creates a new RecordList starting at |data| + void create(uint8_t *data, size_t range_size) { + m_data = data; + m_index.create(m_data, range_size, range_size / get_full_record_size()); + } + + // Calculates the required size for a range with the specified |capacity| + size_t get_required_range_size(size_t node_count) const { + return (m_index.get_required_range_size(node_count)); + } + + // Returns the actual key record including overhead + size_t get_full_record_size() const { 
+ return (1 + 1 + 8 + m_index.get_full_index_size()); + } + + // Returns the number of duplicates + int get_record_count(Context *context, int slot) { + uint32_t offset = m_index.get_absolute_chunk_offset(slot); + if (unlikely(m_data[offset] & BtreeRecord::kExtendedDuplicates)) { + DuplicateTable *dt = get_duplicate_table(context, get_record_id(slot)); + return ((int) dt->get_record_count()); + } + + return (m_data[offset] & 0x7f); + } + + // Returns the size of a record + uint64_t get_record_size(Context *context, int slot, + int duplicate_index = 0) { + uint32_t offset = m_index.get_absolute_chunk_offset(slot); + if (unlikely(m_data[offset] & BtreeRecord::kExtendedDuplicates)) { + DuplicateTable *dt = get_duplicate_table(context, get_record_id(slot)); + return (dt->get_record_size(context, duplicate_index)); + } + + uint8_t *p = &m_data[offset + 1 + 9 * duplicate_index]; + uint8_t flags = *(p++); + if (flags & BtreeRecord::kBlobSizeTiny) + return (p[sizeof(uint64_t) - 1]); + if (flags & BtreeRecord::kBlobSizeSmall) + return (sizeof(uint64_t)); + if (flags & BtreeRecord::kBlobSizeEmpty) + return (0); + + LocalEnvironment *env = m_db->lenv(); + return (env->blob_manager()->get_blob_size(context, *(uint64_t *)p)); + } + + // Returns the full record and stores it in |dest|; memory must be + // allocated by the caller + void get_record(Context *context, int slot, ByteArray *arena, + ham_record_t *record, uint32_t flags, int duplicate_index) { + // forward to duplicate table? 
+ uint32_t offset = m_index.get_absolute_chunk_offset(slot); + if (unlikely(m_data[offset] & BtreeRecord::kExtendedDuplicates)) { + DuplicateTable *dt = get_duplicate_table(context, get_record_id(slot)); + dt->get_record(context, arena, record, flags, duplicate_index); + return; + } + + ham_assert(duplicate_index < (int)get_inline_record_count(slot)); + bool direct_access = (flags & HAM_DIRECT_ACCESS) != 0; + + uint8_t *p = &m_data[offset + 1 + 9 * duplicate_index]; + uint8_t record_flags = *(p++); + + if (record_flags && (flags & HAM_PARTIAL)) { + ham_trace(("flag HAM_PARTIAL is not allowed if record is " + "stored inline")); + throw Exception(HAM_INV_PARAMETER); + } + + if (record_flags & BtreeRecord::kBlobSizeEmpty) { + record->data = 0; + record->size = 0; + return; + } + + if (record_flags & BtreeRecord::kBlobSizeTiny) { + record->size = p[sizeof(uint64_t) - 1]; + if (direct_access) + record->data = &p[0]; + else { + if ((record->flags & HAM_RECORD_USER_ALLOC) == 0) { + arena->resize(record->size); + record->data = arena->get_ptr(); + } + memcpy(record->data, &p[0], record->size); + } + return; + } + + if (record_flags & BtreeRecord::kBlobSizeSmall) { + record->size = sizeof(uint64_t); + if (direct_access) + record->data = &p[0]; + else { + if ((record->flags & HAM_RECORD_USER_ALLOC) == 0) { + arena->resize(record->size); + record->data = arena->get_ptr(); + } + memcpy(record->data, &p[0], record->size); + } + return; + } + + uint64_t blob_id = *(uint64_t *)p; + + // the record is stored as a blob + LocalEnvironment *env = m_db->lenv(); + env->blob_manager()->read(context, blob_id, record, flags, arena); + } + + // Updates the record of a key + void set_record(Context *context, int slot, int duplicate_index, + ham_record_t *record, uint32_t flags, + uint32_t *new_duplicate_index = 0) { + uint32_t chunk_offset = m_index.get_absolute_chunk_offset(slot); + uint32_t current_size = m_index.get_chunk_size(slot); + + // if the slot was not yet allocated: allocate new 
space, initialize + // it and then overwrite the record + if (current_size == 0) { + duplicate_index = 0; + flags |= HAM_OVERWRITE; + chunk_offset = m_index.allocate_space(m_node->get_count(), slot, 1 + 9); + chunk_offset = m_index.get_absolute_offset(chunk_offset); + // clear the record flags + m_data[chunk_offset] = 0; + m_data[chunk_offset + 1] = BtreeRecord::kBlobSizeEmpty; + + set_inline_record_count(slot, 1); + } + + // if there's no duplicate table, but we're not able to add another + // duplicate then offload all existing duplicates to a table + uint32_t record_count = get_inline_record_count(slot); + size_t required_size = 1 + (record_count + 1) * 9; + + if (!(m_data[chunk_offset] & BtreeRecord::kExtendedDuplicates) + && !(flags & HAM_OVERWRITE)) { + bool force_duptable = record_count >= m_duptable_threshold; + if (!force_duptable + && !m_index.can_allocate_space(m_node->get_count(), + required_size)) + force_duptable = true; + + // update chunk_offset - it might have been modified if + // m_index.can_allocate_space triggered a vacuumize() operation + chunk_offset = m_index.get_absolute_chunk_offset(slot); + + // already too many duplicates, or the record does not fit? 
then + // allocate an overflow duplicate list and move all duplicates to + // this list + if (force_duptable) { + DuplicateTable *dt = new DuplicateTable(m_db, !m_store_flags, + HAM_RECORD_SIZE_UNLIMITED); + uint64_t table_id = dt->create(context, get_record_data(slot, 0), + record_count); + if (!m_duptable_cache) + m_duptable_cache.reset(new DuplicateTableCache()); + (*m_duptable_cache)[table_id] = dt; + + // write the id of the duplicate table + if (m_index.get_chunk_size(slot) < 8 + 1) { + // do not erase the slot because it obviously occupies so + // little space + m_index.allocate_space(m_node->get_count(), slot, 8 + 1); + chunk_offset = m_index.get_absolute_chunk_offset(slot); + } + + m_data[chunk_offset] |= BtreeRecord::kExtendedDuplicates; + set_record_id(slot, table_id); + set_inline_record_count(slot, 0); + + m_index.set_chunk_size(slot, 10); + m_index.increase_vacuumize_counter(m_index.get_chunk_size(slot) - 10); + m_index.invalidate_next_offset(); + + // fall through + } + } + + // forward to duplicate table? 
+ if (unlikely(m_data[chunk_offset] & BtreeRecord::kExtendedDuplicates)) { + uint64_t table_id = get_record_id(slot); + DuplicateTable *dt = get_duplicate_table(context, table_id); + uint64_t new_table_id = dt->set_record(context, duplicate_index, record, + flags, new_duplicate_index); + if (new_table_id != table_id) { + update_duplicate_table_id(dt, table_id, new_table_id); + set_record_id(slot, new_table_id); + } + return; + } + + uint64_t overwrite_blob_id = 0; + uint8_t *record_flags = 0; + uint8_t *p = 0; + + // the (inline) duplicate is overwritten + if (flags & HAM_OVERWRITE) { + record_flags = &m_data[chunk_offset + 1 + 9 * duplicate_index]; + p = record_flags + 1; + + // If a blob is overwritten with an inline record then the old blob + // has to be deleted + if (*record_flags == 0) { + if (record->size <= 8) { + uint64_t blob_id = *(uint64_t *)p; + if (blob_id) + m_db->lenv()->blob_manager()->erase(context, blob_id); + } + else + overwrite_blob_id = *(uint64_t *)p; + // fall through + } + // then jump to the code which performs the actual insertion + goto write_record; + } + + // Allocate new space for the duplicate table, if required + if (current_size < required_size) { + uint8_t *oldp = &m_data[chunk_offset]; + uint32_t old_chunk_size = m_index.get_chunk_size(slot); + uint32_t old_chunk_offset = m_index.get_chunk_offset(slot); + uint32_t new_chunk_offset = m_index.allocate_space(m_node->get_count(), + slot, required_size); + chunk_offset = m_index.get_absolute_offset(new_chunk_offset); + if (current_size > 0) + memmove(&m_data[chunk_offset], oldp, current_size); + if (old_chunk_offset != new_chunk_offset) + m_index.add_to_freelist(m_node->get_count(), old_chunk_offset, + old_chunk_size); + } + + // adjust flags + if (flags & HAM_DUPLICATE_INSERT_BEFORE && duplicate_index == 0) + flags |= HAM_DUPLICATE_INSERT_FIRST; + else if (flags & HAM_DUPLICATE_INSERT_AFTER) { + if (duplicate_index == (int)record_count) + flags |= HAM_DUPLICATE_INSERT_LAST; + else { 
+ flags |= HAM_DUPLICATE_INSERT_BEFORE; + duplicate_index++; + } + } + + // handle overwrites or inserts/appends + if (flags & HAM_DUPLICATE_INSERT_FIRST) { + if (record_count > 0) { + uint8_t *ptr = &m_data[chunk_offset + 1]; + memmove(&m_data[chunk_offset + 1 + 9], ptr, record_count * 9); + } + duplicate_index = 0; + } + else if (flags & HAM_DUPLICATE_INSERT_BEFORE) { + memmove(&m_data[chunk_offset + 1 + 9 * (duplicate_index + 1)], + &m_data[chunk_offset + 1 + 9 * duplicate_index], + (record_count - duplicate_index) * 9); + } + else // HAM_DUPLICATE_INSERT_LAST + duplicate_index = record_count; + + set_inline_record_count(slot, record_count + 1); + + record_flags = &m_data[chunk_offset + 1 + 9 * duplicate_index]; + p = record_flags + 1; + +write_record: + if (record->size == 0) { + memcpy(p, "\0\0\0\0\0\0\0\0", 8); + *record_flags = BtreeRecord::kBlobSizeEmpty; + } + else if (record->size < sizeof(uint64_t)) { + p[sizeof(uint64_t) - 1] = (uint8_t)record->size; + memcpy(&p[0], record->data, record->size); + *record_flags = BtreeRecord::kBlobSizeTiny; + } + else if (record->size == sizeof(uint64_t)) { + memcpy(&p[0], record->data, record->size); + *record_flags = BtreeRecord::kBlobSizeSmall; + } + else { + LocalEnvironment *env = m_db->lenv(); + *record_flags = 0; + uint64_t blob_id; + if (overwrite_blob_id) + blob_id = env->blob_manager()->overwrite(context, + overwrite_blob_id, record, flags); + else + blob_id = env->blob_manager()->allocate(context, record, flags); + memcpy(p, &blob_id, sizeof(blob_id)); + } + + if (new_duplicate_index) + *new_duplicate_index = duplicate_index; + } + + // Erases a record + void erase_record(Context *context, int slot, int duplicate_index = 0, + bool all_duplicates = false) { + uint32_t offset = m_index.get_absolute_chunk_offset(slot); + + // forward to external duplicate table? 
+ if (unlikely(m_data[offset] & BtreeRecord::kExtendedDuplicates)) { + uint64_t table_id = get_record_id(slot); + DuplicateTable *dt = get_duplicate_table(context, table_id); + uint64_t new_table_id = dt->erase_record(context, duplicate_index, + all_duplicates); + if (new_table_id == 0) { + m_duptable_cache->erase(table_id); + set_record_id(slot, 0); + m_data[offset] &= ~BtreeRecord::kExtendedDuplicates; + delete dt; + } + else if (new_table_id != table_id) { + update_duplicate_table_id(dt, table_id, new_table_id); + set_record_id(slot, new_table_id); + } + return; + } + + // erase the last duplicate? + uint32_t count = get_inline_record_count(slot); + if (count == 1 && duplicate_index == 0) + all_duplicates = true; + + // adjust next_offset, if necessary. Note that get_next_offset() is + // called with a node_count of zero, which is valid (it avoids a + // recalculation in case there is no next_offset) + m_index.maybe_invalidate_next_offset(m_index.get_chunk_offset(slot) + + m_index.get_chunk_size(slot)); + + // erase all duplicates? 
+ if (all_duplicates) { + for (uint32_t i = 0; i < count; i++) { + uint8_t *p = &m_data[offset + 1 + 9 * i]; + if (!is_record_inline(*p)) { + m_db->lenv()->blob_manager()->erase(context, *(uint64_t *)(p + 1)); + *(uint64_t *)(p + 1) = 0; + } + } + set_inline_record_count(slot, 0); + m_index.set_chunk_size(slot, 0); + } + else { + uint8_t *p = &m_data[offset + 1 + 9 * duplicate_index]; + if (!is_record_inline(*p)) { + m_db->lenv()->blob_manager()->erase(context, *(uint64_t *)(p + 1)); + *(uint64_t *)(p + 1) = 0; + } + if (duplicate_index < (int)count - 1) + memmove(&m_data[offset + 1 + 9 * duplicate_index], + &m_data[offset + 1 + 9 * (duplicate_index + 1)], + 9 * (count - duplicate_index - 1)); + set_inline_record_count(slot, count - 1); + } + } + + // Returns a record id + uint64_t get_record_id(int slot, + int duplicate_index = 0) const { + return (*(uint64_t *)get_record_data(slot, duplicate_index)); + } + + // Sets a record id + void set_record_id(int slot, uint64_t id) { + *(uint64_t *)get_record_data(slot, 0) = id; + } + + // Checks the integrity of this node. Throws an exception if there is a + // violation. + void check_integrity(Context *context, size_t node_count) const { + for (size_t i = 0; i < node_count; i++) { + uint32_t offset = m_index.get_absolute_chunk_offset(i); + if (m_data[offset] & BtreeRecord::kExtendedDuplicates) { + ham_assert((m_data[offset] & 0x7f) == 0); + } + } + + m_index.check_integrity(node_count); + } + + // Change the capacity; the capacity will be reduced, growing is not + // implemented. Which means that the data area must be copied; the offsets + // do not have to be changed. + void change_range_size(size_t node_count, uint8_t *new_data_ptr, + size_t new_range_size, size_t capacity_hint) { + // no capacity given? 
then try to find a good default one + if (capacity_hint == 0) { + capacity_hint = (new_range_size - m_index.get_next_offset(node_count) + - get_full_record_size()) / m_index.get_full_index_size(); + if (capacity_hint <= node_count) + capacity_hint = node_count + 1; + } + + // if there's not enough space for the new capacity then try to reduce + // the capacity + if (m_index.get_next_offset(node_count) + get_full_record_size() + + capacity_hint * m_index.get_full_index_size() + + UpfrontIndex::kPayloadOffset + > new_range_size) + capacity_hint = node_count + 1; + + m_index.change_range_size(node_count, new_data_ptr, new_range_size, + capacity_hint); + m_data = new_data_ptr; + m_range_size = new_range_size; + } + + // Returns true if there's not enough space for another record + bool requires_split(size_t node_count) { + // if the record is extremely small then make sure there's some headroom; + // this is required for DuplicateTable ids which are 64bit numbers + size_t required = get_full_record_size(); + if (required < 10) + required = 10; + return (m_index.requires_split(node_count, required)); + } + + // Fills the btree_metrics structure + void fill_metrics(btree_metrics_t *metrics, size_t node_count) { + BaseRecordList::fill_metrics(metrics, node_count); + BtreeStatistics::update_min_max_avg(&metrics->recordlist_index, + m_index.get_capacity() * m_index.get_full_index_size()); + BtreeStatistics::update_min_max_avg(&metrics->recordlist_unused, + m_range_size - get_required_range_size(node_count)); + } + + // Prints a slot to |out| (for debugging) + void print(Context *context, int slot, std::stringstream &out) { + out << "(" << get_record_count(context, slot) << " records)"; + } + + private: + // Returns the number of records that are stored inline + uint32_t get_inline_record_count(int slot) { + uint32_t offset = m_index.get_absolute_chunk_offset(slot); + return (m_data[offset] & 0x7f); + } + + // Sets the number of records that are stored inline + void 
set_inline_record_count(int slot, size_t count) { + ham_assert(count <= 0x7f); + uint32_t offset = m_index.get_absolute_chunk_offset(slot); + m_data[offset] &= BtreeRecord::kExtendedDuplicates; + m_data[offset] |= count; + } + + // Returns a pointer to the record data + uint8_t *get_record_data(int slot, int duplicate_index = 0) { + uint32_t offset = m_index.get_absolute_chunk_offset(slot); + return (&m_data[offset + 1 + 9 * duplicate_index]); + } + + // Returns a pointer to the record data (const flavour) + const uint8_t *get_record_data(int slot, + int duplicate_index = 0) const { + uint32_t offset = m_index.get_absolute_chunk_offset(slot); + return (&m_data[offset + 1 + 9 * duplicate_index]); + } +}; + +} // namespace DefLayout + +} // namespace hamsterdb + +#endif /* HAM_BTREE_RECORDS_DUPLICATE_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_records_inline.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_records_inline.h new file mode 100644 index 0000000000..6a7ac4ff35 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_records_inline.h @@ -0,0 +1,230 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * RecordList for Inline Records + * + * Inline Records are records that are stored directly in the leaf node, and + * not in an external blob. Only for fixed length records. 
+ * + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_RECORDS_INLINE_H +#define HAM_BTREE_RECORDS_INLINE_H + +#include "0root/root.h" + +#include <sstream> +#include <iostream> + +// Always verify that a file of level N does not include headers > N! +#include "1globals/globals.h" +#include "1base/dynamic_array.h" +#include "2page/page.h" +#include "3blob_manager/blob_manager.h" +#include "3btree/btree_node.h" +#include "3btree/btree_records_base.h" +#include "4env/env_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +// +// The template classes in this file are wrapped in a separate namespace +// to avoid naming clashes with btree_impl_default.h +// +namespace PaxLayout { + +class InlineRecordList : public BaseRecordList +{ + public: + enum { + // A flag whether this RecordList has sequential data + kHasSequentialData = 1 + }; + + // Constructor + InlineRecordList(LocalDatabase *db, PBtreeNode *node) + : m_db(db), m_record_size(db->config().record_size), m_data(0) { + ham_assert(m_record_size != HAM_RECORD_SIZE_UNLIMITED); + } + + // Sets the data pointer + void create(uint8_t *data, size_t range_size) { + m_data = (uint8_t *)data; + m_range_size = range_size; + } + + // Opens an existing RecordList + void open(uint8_t *ptr, size_t range_size, size_t node_count) { + m_data = ptr; + m_range_size = range_size; + } + + // Returns the actual record size including overhead + size_t get_full_record_size() const { + return (m_record_size); + } + + // Calculates the required size for a range with the specified |capacity| + size_t get_required_range_size(size_t node_count) const { + return (node_count * m_record_size); + } + + // Returns the record counter of a key + int get_record_count(Context *context, int slot) const { + return (1); + } + + // Returns the record size + uint64_t get_record_size(Context *context, int slot, + int duplicate_index = 0) const { + return (m_record_size); + } + + 
// Returns the full record and stores it in |dest|; memory must be + // allocated by the caller + void get_record(Context *context, int slot, ByteArray *arena, + ham_record_t *record, uint32_t flags, + int duplicate_index) const { + bool direct_access = (flags & HAM_DIRECT_ACCESS) != 0; + + if (flags & HAM_PARTIAL) { + ham_trace(("flag HAM_PARTIAL is not allowed if record is " + "stored inline")); + throw Exception(HAM_INV_PARAMETER); + } + + // the record is stored inline + record->size = m_record_size; + + if (m_record_size == 0) + record->data = 0; + else if (direct_access) + record->data = &m_data[slot * m_record_size]; + else { + if ((record->flags & HAM_RECORD_USER_ALLOC) == 0) { + arena->resize(record->size); + record->data = arena->get_ptr(); + } + memcpy(record->data, &m_data[slot * m_record_size], record->size); + } + } + + // Updates the record of a key + void set_record(Context *context, int slot, int duplicate_index, + ham_record_t *record, uint32_t flags, + uint32_t *new_duplicate_index = 0) { + ham_assert(record->size == m_record_size); + // it's possible that the records have size 0 - then don't copy anything + if (m_record_size) + memcpy(&m_data[m_record_size * slot], record->data, m_record_size); + } + + // Erases the record + void erase_record(Context *context, int slot, int duplicate_index = 0, + bool all_duplicates = true) { + if (m_record_size) + memset(&m_data[m_record_size * slot], 0, m_record_size); + } + + // Erases a whole slot by shifting all larger records to the "left" + void erase(Context *context, size_t node_count, int slot) { + if (slot < (int)node_count - 1) + memmove(&m_data[m_record_size * slot], + &m_data[m_record_size * (slot + 1)], + m_record_size * (node_count - slot - 1)); + } + + // Creates space for one additional record + void insert(Context *context, size_t node_count, int slot) { + if (slot < (int)node_count) { + memmove(&m_data[m_record_size * (slot + 1)], + &m_data[m_record_size * slot], + m_record_size * (node_count 
- slot)); + } + memset(&m_data[m_record_size * slot], 0, m_record_size); + } + + // Copies |count| records from this[sstart] to dest[dstart] + void copy_to(int sstart, size_t node_count, InlineRecordList &dest, + size_t other_count, int dstart) { + memcpy(&dest.m_data[m_record_size * dstart], + &m_data[m_record_size * sstart], + m_record_size * (node_count - sstart)); + } + + // Returns the record id. Not required for fixed length leaf nodes + uint64_t get_record_id(int slot, int duplicate_index = 0) + const { + ham_assert(!"shouldn't be here"); + return (0); + } + + // Sets the record id. Not required for fixed length leaf nodes + void set_record_id(int slot, uint64_t ptr) { + ham_assert(!"shouldn't be here"); + } + + // Returns true if there's not enough space for another record + bool requires_split(size_t node_count) const { + if (m_range_size == 0) + return (false); + return ((node_count + 1) * m_record_size >= m_range_size); + } + + // Change the capacity; for PAX layouts this just means copying the + // data from one place to the other + void change_range_size(size_t node_count, uint8_t *new_data_ptr, + size_t new_range_size, size_t capacity_hint) { + memmove(new_data_ptr, m_data, node_count * m_record_size); + m_data = new_data_ptr; + m_range_size = new_range_size; + } + + // Fills the btree_metrics structure + void fill_metrics(btree_metrics_t *metrics, size_t node_count) { + BaseRecordList::fill_metrics(metrics, node_count); + BtreeStatistics::update_min_max_avg(&metrics->recordlist_unused, + m_range_size - get_required_range_size(node_count)); + } + + // Prints a slot to |out| (for debugging) + void print(Context *context, int slot, std::stringstream &out) const { + out << "(" << get_record_size(context, slot) << " bytes)"; + } + + private: + // The parent database of this btree + LocalDatabase *m_db; + + // The record size, as specified when the database was created + size_t m_record_size; + + // The actual record data + uint8_t *m_data; +}; + +} // 
namespace PaxLayout + +} // namespace hamsterdb + +#endif /* HAM_BTREE_RECORDS_INLINE_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_records_internal.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_records_internal.h new file mode 100644 index 0000000000..9773119991 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_records_internal.h @@ -0,0 +1,230 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Internal RecordList + * + * Only for records of internal nodes. Internal nodes only store page IDs, + * therefore this |InternalRecordList| is optimized for 64bit IDs + * (and is implemented as a uint64_t[] array). + * + * For file-based databases the page IDs are divided by the page size, which + * results in smaller IDs. Small IDs can be compressed more efficiently + * (-> hamsterdb pro). + * + * In-memory based databases just store the raw pointers. + * + * @exception_safe: nothrow + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_RECORDS_INTERNAL_H +#define HAM_BTREE_RECORDS_INTERNAL_H + +#include "0root/root.h" + +#include <sstream> +#include <iostream> + +// Always verify that a file of level N does not include headers > N! 
+#include "1globals/globals.h" +#include "1base/dynamic_array.h" +#include "2page/page.h" +#include "3blob_manager/blob_manager.h" +#include "3btree/btree_records_base.h" +#include "3btree/btree_node.h" +#include "4env/env_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +// +// The template classes in this file are wrapped in a separate namespace +// to avoid naming clashes with btree_impl_default.h +// +namespace PaxLayout { + +class InternalRecordList : public BaseRecordList +{ + public: + enum { + // A flag whether this RecordList has sequential data + kHasSequentialData = 1 + }; + + // Constructor + InternalRecordList(LocalDatabase *db, PBtreeNode *node) + : m_db(db), m_data(0) { + m_page_size = m_db->lenv()->config().page_size_bytes; + m_store_raw_id = (m_db->lenv()->config().flags + & HAM_IN_MEMORY) == HAM_IN_MEMORY; + } + + // Sets the data pointer + void create(uint8_t *data, size_t range_size) { + m_data = (uint64_t *)data; + m_range_size = range_size; + } + + // Opens an existing RecordList + void open(uint8_t *ptr, size_t range_size, size_t node_count) { + m_data = (uint64_t *)ptr; + m_range_size = range_size; + } + + // Returns the actual size including overhead + size_t get_full_record_size() const { + return (sizeof(uint64_t)); + } + + // Calculates the required size for a range with the specified |capacity| + size_t get_required_range_size(size_t node_count) const { + return (node_count * sizeof(uint64_t)); + } + + // Returns the record counter of a key; this implementation does not + // support duplicates, therefore the record count is always 1 + int get_record_count(Context *context, int slot) const { + return (1); + } + + // Returns the record size + uint64_t get_record_size(Context *context, int slot, + int duplicate_index = 0) const { + return (sizeof(uint64_t)); + } + + // Returns the full record and stores it in |dest|; memory must be + // allocated by the caller + void get_record(Context 
*context, int slot, ByteArray *arena, + ham_record_t *record, uint32_t flags, + int duplicate_index) const { + bool direct_access = (flags & HAM_DIRECT_ACCESS) != 0; + + // the record is stored inline + record->size = sizeof(uint64_t); + + if (direct_access) + record->data = (void *)&m_data[slot]; + else { + if ((record->flags & HAM_RECORD_USER_ALLOC) == 0) { + arena->resize(record->size); + record->data = arena->get_ptr(); + } + memcpy(record->data, &m_data[slot], record->size); + } + } + + // Updates the record of a key + void set_record(Context *context, int slot, int duplicate_index, + ham_record_t *record, uint32_t flags, + uint32_t *new_duplicate_index = 0) { + ham_assert(record->size == sizeof(uint64_t)); + m_data[slot] = *(uint64_t *)record->data; + } + + // Erases the record + void erase_record(Context *context, int slot, int duplicate_index = 0, + bool all_duplicates = true) { + m_data[slot] = 0; + } + + // Erases a whole slot by shifting all larger records to the "left" + void erase(Context *context, size_t node_count, int slot) { + if (slot < (int)node_count - 1) + memmove(&m_data[slot], &m_data[slot + 1], + sizeof(uint64_t) * (node_count - slot - 1)); + } + + // Creates space for one additional record + void insert(Context *context, size_t node_count, int slot) { + if (slot < (int)node_count) { + memmove(&m_data[slot + 1], &m_data[slot], + sizeof(uint64_t) * (node_count - slot)); + } + m_data[slot] = 0; + } + + // Copies |count| records from this[sstart] to dest[dstart] + void copy_to(int sstart, size_t node_count, InternalRecordList &dest, + size_t other_count, int dstart) { + memcpy(&dest.m_data[dstart], &m_data[sstart], + sizeof(uint64_t) * (node_count - sstart)); + } + + // Sets the record id + void set_record_id(int slot, uint64_t value) { + ham_assert(m_store_raw_id ? 1 : value % m_page_size == 0); + m_data[slot] = m_store_raw_id ? 
value : value / m_page_size; + } + + // Returns the record id + uint64_t get_record_id(int slot, + int duplicate_index = 0) const { + ham_assert(duplicate_index == 0); + return (m_store_raw_id ? m_data[slot] : m_page_size * m_data[slot]); + } + + // Returns true if there's not enough space for another record + bool requires_split(size_t node_count) const { + return ((node_count + 1) * sizeof(uint64_t) >= m_range_size); + } + + // Change the capacity; for PAX layouts this just means copying the + // data from one place to the other + void change_range_size(size_t node_count, uint8_t *new_data_ptr, + size_t new_range_size, size_t capacity_hint) { + if ((uint64_t *)new_data_ptr != m_data) { + memmove(new_data_ptr, m_data, node_count * sizeof(uint64_t)); + m_data = (uint64_t *)new_data_ptr; + } + m_range_size = new_range_size; + } + + // Fills the btree_metrics structure + void fill_metrics(btree_metrics_t *metrics, size_t node_count) { + BaseRecordList::fill_metrics(metrics, node_count); + BtreeStatistics::update_min_max_avg(&metrics->recordlist_unused, + m_range_size - get_required_range_size(node_count)); + } + + // Prints a slot to |out| (for debugging) + void print(Context *context, int slot, std::stringstream &out) const { + out << "(" << get_record_id(slot) << ")"; + } + + private: + // The parent database of this btree + LocalDatabase *m_db; + + // The record data is an array of page IDs + uint64_t *m_data; + + // The page size + size_t m_page_size; + + // Store page ID / page size or the raw page ID? + bool m_store_raw_id; +}; + +} // namespace PaxLayout + +} // namespace hamsterdb + +#endif /* HAM_BTREE_RECORDS_INTERNAL_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_stats.cc b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_stats.cc new file mode 100644 index 0000000000..edd8c7b7a1 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_stats.cc @@ -0,0 +1,181 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +#include <string.h> +#include <stdio.h> + +// Always verify that a file of level N does not include headers > N! +#include "2page/page.h" +#include "3btree/btree_stats.h" +#include "3btree/btree_index.h" +#include "3btree/btree_node_proxy.h" +#include "4db/db_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +BtreeStatistics::BtreeStatistics() + : m_append_count(0), m_prepend_count(0) +{ + memset(&m_last_leaf_pages[0], 0, sizeof(m_last_leaf_pages)); + memset(&m_last_leaf_count[0], 0, sizeof(m_last_leaf_count)); + memset(&m_keylist_range_size[0], 0, sizeof(m_keylist_range_size)); + memset(&m_keylist_capacities[0], 0, sizeof(m_keylist_capacities)); +} + +void +BtreeStatistics::find_succeeded(Page *page) +{ + uint64_t old = m_last_leaf_pages[kOperationFind]; + if (old != page->get_address()) { + m_last_leaf_pages[kOperationFind] = 0; + m_last_leaf_count[kOperationFind] = 0; + } + else + m_last_leaf_count[kOperationFind]++; +} + +void +BtreeStatistics::find_failed() +{ + m_last_leaf_pages[kOperationFind] = 0; + m_last_leaf_count[kOperationFind] = 0; +} + +void +BtreeStatistics::insert_succeeded(Page *page, uint16_t slot) +{ + uint64_t old = m_last_leaf_pages[kOperationInsert]; + if (old != page->get_address()) { + m_last_leaf_pages[kOperationInsert] = page->get_address(); + m_last_leaf_count[kOperationInsert] = 0; + } + else + 
m_last_leaf_count[kOperationInsert]++; + + BtreeNodeProxy *node; + node = page->get_db()->btree_index()->get_node_from_page(page); + ham_assert(node->is_leaf()); + + if (!node->get_right() && slot == node->get_count() - 1) + m_append_count++; + else + m_append_count = 0; + + if (!node->get_left() && slot == 0) + m_prepend_count++; + else + m_prepend_count = 0; +} + +void +BtreeStatistics::insert_failed() +{ + m_last_leaf_pages[kOperationInsert] = 0; + m_last_leaf_count[kOperationInsert] = 0; + m_append_count = 0; + m_prepend_count = 0; +} + +void +BtreeStatistics::erase_succeeded(Page *page) +{ + uint64_t old = m_last_leaf_pages[kOperationErase]; + if (old != page->get_address()) { + m_last_leaf_pages[kOperationErase] = page->get_address(); + m_last_leaf_count[kOperationErase] = 0; + } + else + m_last_leaf_count[kOperationErase]++; +} + +void +BtreeStatistics::erase_failed() +{ + m_last_leaf_pages[kOperationErase] = 0; + m_last_leaf_count[kOperationErase] = 0; +} + +void +BtreeStatistics::reset_page(Page *page) +{ + for (int i = 0; i < kOperationMax; i++) { + m_last_leaf_pages[i] = 0; + m_last_leaf_count[i] = 0; + } +} + +BtreeStatistics::FindHints +BtreeStatistics::get_find_hints(uint32_t flags) +{ + BtreeStatistics::FindHints hints = {flags, flags, 0, false}; + + /* if the last 5 lookups hit the same page: reuse that page */ + if (m_last_leaf_count[kOperationFind] >= 5) { + hints.try_fast_track = true; + hints.leaf_page_addr = m_last_leaf_pages[kOperationFind]; + } + + return (hints); +} + +BtreeStatistics::InsertHints +BtreeStatistics::get_insert_hints(uint32_t flags) +{ + InsertHints hints = {flags, flags, 0, 0, 0, 0, 0}; + + /* if the previous insert-operation replaced the upper bound (or + * lower bound) key then it was actually an append (or prepend) operation. + * in this case there's some probability that the next operation is also + * appending/prepending. 
+ */ + if (m_append_count > 0) + hints.flags |= HAM_HINT_APPEND; + else if (m_prepend_count > 0) + hints.flags |= HAM_HINT_PREPEND; + + hints.append_count = m_append_count; + hints.prepend_count = m_prepend_count; + + /* if the last 5 inserts hit the same page: reuse that page */ + if (m_last_leaf_count[kOperationInsert] >= 5) + hints.leaf_page_addr = m_last_leaf_pages[kOperationInsert]; + + return (hints); +} + +#define AVG(m) m._instances ? (m._total / m._instances) : 0 + +void +BtreeStatistics::finalize_metrics(btree_metrics_t *metrics) +{ + metrics->keys_per_page.avg = AVG(metrics->keys_per_page); + metrics->keylist_ranges.avg = AVG(metrics->keylist_ranges); + metrics->recordlist_ranges.avg = AVG(metrics->recordlist_ranges); + metrics->keylist_index.avg = AVG(metrics->keylist_index); + metrics->recordlist_index.avg = AVG(metrics->recordlist_index); + metrics->keylist_unused.avg = AVG(metrics->keylist_unused); + metrics->recordlist_unused.avg = AVG(metrics->recordlist_unused); + metrics->keylist_blocks_per_page.avg = AVG(metrics->keylist_blocks_per_page); + metrics->keylist_block_sizes.avg = AVG(metrics->keylist_block_sizes); +} + +} // namespace hamsterdb diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_stats.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_stats.h new file mode 100644 index 0000000000..66c3f21ab9 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_stats.h @@ -0,0 +1,179 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * btree find/insert/erase statistical structures, functions and macros + * + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_STATS_H +#define HAM_BTREE_STATS_H + +#include "0root/root.h" + +#include <limits> + +#include "ham/hamsterdb_int.h" + +// Always verify that a file of level N does not include headers > N! + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class Page; + +class BtreeStatistics { + public: + // Indices into find/insert/erase specific statistics + enum { + kOperationFind = 0, + kOperationInsert = 1, + kOperationErase = 2, + kOperationMax = 3 + }; + + struct FindHints { + // the original flags of ham_find + uint32_t original_flags; + + // the modified flags + uint32_t flags; + + // page/btree leaf to check first + uint64_t leaf_page_addr; + + // check specified btree leaf node page first + bool try_fast_track; + }; + + struct InsertHints { + // the original flags of ham_insert + uint32_t original_flags; + + // the modified flags + uint32_t flags; + + // page/btree leaf to check first + uint64_t leaf_page_addr; + + // the processed leaf page + Page *processed_leaf_page; + + // the slot in that page + uint16_t processed_slot; + + // count the number of appends + size_t append_count; + + // count the number of prepends + size_t prepend_count; + }; + + // Constructor + BtreeStatistics(); + + // Returns the btree hints for ham_find + FindHints get_find_hints(uint32_t flags); + + // Returns the btree hints for insert + InsertHints get_insert_hints(uint32_t flags); + + // Reports that a ham_find/ham_cusor_find succeeded + void find_succeeded(Page *page); + + // Reports that a ham_find/ham_cursor_find failed + void find_failed(); + + // Reports that a ham_insert/ham_cursor_insert succeeded + void insert_succeeded(Page *page, uint16_t slot); + + // Reports that a 
ham_insert/ham_cursor_insert failed + void insert_failed(); + + // Reports that a ham_erase/ham_cusor_erase succeeded + void erase_succeeded(Page *page); + + // Reports that a ham_erase/ham_cursor_erase failed + void erase_failed(); + + // Resets the statistics for a single page + void reset_page(Page *page); + + // Keep track of the KeyList range size + void set_keylist_range_size(bool leaf, size_t size) { + m_keylist_range_size[(int)leaf] = size; + } + + // Retrieves the KeyList range size + size_t get_keylist_range_size(bool leaf) const { + return (m_keylist_range_size[(int)leaf]); + } + + // Keep track of the KeyList capacities + void set_keylist_capacities(bool leaf, size_t capacity) { + m_keylist_capacities[(int)leaf] = capacity; + } + + // Retrieves the KeyList capacities size + size_t get_keylist_capacities(bool leaf) const { + return (m_keylist_capacities[(int)leaf]); + } + + // Calculate the "average" values + static void finalize_metrics(btree_metrics_t *metrics); + + // Update a min_max_avg structure + static void update_min_max_avg(min_max_avg_u32_t *data, uint32_t value) { + // first update? 
then perform initialization + if (data->_instances == 0) + data->min = std::numeric_limits<uint32_t>::max(); + + if (data->min > value) + data->min = value; + if (data->max < value) + data->max = value; + data->_total += value; + data->_instances++; + } + + private: + // last leaf page for find/insert/erase + uint64_t m_last_leaf_pages[kOperationMax]; + + // count of how often this leaf page was used + size_t m_last_leaf_count[kOperationMax]; + + // count the number of appends + size_t m_append_count; + + // count the number of prepends + size_t m_prepend_count; + + // the range size of the KeyList + size_t m_keylist_range_size[2]; + + // the capacities of the KeyList + size_t m_keylist_capacities[2]; +}; + +} // namespace hamsterdb + +#endif /* HAM_BTREE_STATS_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_update.cc b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_update.cc new file mode 100644 index 0000000000..07d6cf61d4 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_update.cc @@ -0,0 +1,436 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +#include <string.h> + +// Always verify that a file of level N does not include headers > N! 
+#include "1base/error.h" +#include "3page_manager/page_manager.h" +#include "3blob_manager/blob_manager.h" +#include "3btree/btree_stats.h" +#include "3btree/btree_index.h" +#include "3btree/btree_update.h" +#include "3btree/btree_node_proxy.h" +#include "4cursor/cursor.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +/* a unittest hook triggered when a page is split */ +void (*g_BTREE_INSERT_SPLIT_HOOK)(void); + +// Traverses the tree, looking for the leaf with the specified |key|. Will +// split or merge nodes while descending. +// Returns the leaf page and the |parent| of the leaf (can be null if +// there is no parent). +Page * +BtreeUpdateAction::traverse_tree(const ham_key_t *key, + BtreeStatistics::InsertHints &hints, + Page **parent) +{ + LocalDatabase *db = m_btree->get_db(); + LocalEnvironment *env = db->lenv(); + + Page *page = env->page_manager()->fetch(m_context, + m_btree->get_root_address()); + BtreeNodeProxy *node = m_btree->get_node_from_page(page); + + *parent = 0; + + // if the root page is empty with children then collapse it + if (node->get_count() == 0 && !node->is_leaf()) { + page = collapse_root(page); + node = m_btree->get_node_from_page(page); + } + + int slot; + + // now walk down the tree + while (!node->is_leaf()) { + // is a split required? + if (node->requires_split(m_context)) { + page = split_page(page, *parent, key, hints); + node = m_btree->get_node_from_page(page); + } + + // get the child page + Page *sib_page = 0; + Page *child_page = m_btree->find_child(m_context, page, key, 0, &slot); + BtreeNodeProxy *child_node = m_btree->get_node_from_page(child_page); + + // We can merge this child with the RIGHT sibling iff... + // 1. it's not the right-most slot (and therefore the right sibling has + // the same parent as the child) + // 2. the child is a leaf! + // 3. it's empty or has too few elements + // 4. 
its right sibling is also empty + if (slot < (int)node->get_count() - 1 + && child_node->is_leaf() + && child_node->requires_merge() + && child_node->get_right() != 0) { + sib_page = env->page_manager()->fetch(m_context, + child_node->get_right(), + PageManager::kOnlyFromCache); + if (sib_page != 0) { + BtreeNodeProxy *sib_node = m_btree->get_node_from_page(sib_page); + if (sib_node->requires_merge()) { + merge_page(child_page, sib_page); + // also remove the link to the sibling from the parent + node->erase(m_context, slot + 1); + page->set_dirty(true); + } + } + } + + // We can also merge this child with the LEFT sibling iff... + // 1. it's not the left-most slot + // 2. the child is a leaf! + // 3. it's empty or has too few elements + // 4. its left sibling is also empty + else if (slot > 0 + && child_node->is_leaf() + && child_node->requires_merge() + && child_node->get_left() != 0) { + sib_page = env->page_manager()->fetch(m_context, + child_node->get_left(), + PageManager::kOnlyFromCache); + if (sib_page != 0) { + BtreeNodeProxy *sib_node = m_btree->get_node_from_page(sib_page); + if (sib_node->requires_merge()) { + merge_page(sib_page, child_page); + // also remove the link to the sibling from the parent + node->erase(m_context, slot); + page->set_dirty(true); + // continue traversal with the sibling + child_page = sib_page; + child_node = sib_node; + } + } + } + + *parent = page; + + // go down one level in the tree + page = child_page; + node = child_node; + } + + return (page); +} + +Page * +BtreeUpdateAction::merge_page(Page *page, Page *sibling) +{ + LocalDatabase *db = m_btree->get_db(); + LocalEnvironment *env = db->lenv(); + + BtreeNodeProxy *node = m_btree->get_node_from_page(page); + BtreeNodeProxy *sib_node = m_btree->get_node_from_page(sibling); + + if (sib_node->is_leaf()) + BtreeCursor::uncouple_all_cursors(m_context, sibling, 0); + + node->merge_from(m_context, sib_node); + page->set_dirty(true); + + // fix the linked list + 
node->set_right(sib_node->get_right()); + if (node->get_right()) { + Page *new_right = env->page_manager()->fetch(m_context, node->get_right()); + BtreeNodeProxy *new_right_node = m_btree->get_node_from_page(new_right); + new_right_node->set_left(page->get_address()); + new_right->set_dirty(true); + } + + m_btree->get_statistics()->reset_page(sibling); + m_btree->get_statistics()->reset_page(page); + env->page_manager()->del(m_context, sibling); + + BtreeIndex::ms_btree_smo_merge++; + return (page); +} + +Page * +BtreeUpdateAction::collapse_root(Page *root_page) +{ + LocalEnvironment *env = root_page->get_db()->lenv(); + BtreeNodeProxy *node = m_btree->get_node_from_page(root_page); + ham_assert(node->get_count() == 0); + + m_btree->get_statistics()->reset_page(root_page); + m_btree->set_root_address(m_context, node->get_ptr_down()); + Page *header = env->page_manager()->fetch(m_context, 0); + header->set_dirty(true); + + Page *new_root = env->page_manager()->fetch(m_context, + m_btree->get_root_address()); + new_root->set_type(Page::kTypeBroot); + env->page_manager()->del(m_context, root_page); + return (new_root); +} + +Page * +BtreeUpdateAction::split_page(Page *old_page, Page *parent, + const ham_key_t *key, + BtreeStatistics::InsertHints &hints) +{ + LocalDatabase *db = m_btree->get_db(); + LocalEnvironment *env = db->lenv(); + + m_btree->get_statistics()->reset_page(old_page); + BtreeNodeProxy *old_node = m_btree->get_node_from_page(old_page); + + /* allocate a new page and initialize it */ + Page *new_page = env->page_manager()->alloc(m_context, Page::kTypeBindex); + { + PBtreeNode *node = PBtreeNode::from_page(new_page); + node->set_flags(old_node->is_leaf() ? PBtreeNode::kLeafNode : 0); + } + BtreeNodeProxy *new_node = m_btree->get_node_from_page(new_page); + + /* no parent page? then we're splitting the root page. 
allocate + * a new root page */ + if (!parent) + parent = allocate_new_root(old_page); + + Page *to_return = 0; + ByteArray pivot_key_arena; + ham_key_t pivot_key = {0}; + + /* if the key is appended then don't split the page; simply allocate + * a new page and insert the new key. */ + int pivot = 0; + if (hints.flags & HAM_HINT_APPEND && old_node->is_leaf()) { + int cmp = old_node->compare(m_context, key, old_node->get_count() - 1); + if (cmp == +1) { + to_return = new_page; + pivot_key = *key; + pivot = old_node->get_count(); + } + } + + /* no append? then calculate the pivot key and perform the split */ + if (pivot != (int)old_node->get_count()) { + pivot = get_pivot(old_node, key, hints); + + /* and store the pivot key for later */ + old_node->get_key(m_context, pivot, &pivot_key_arena, &pivot_key); + + /* leaf page: uncouple all cursors */ + if (old_node->is_leaf()) + BtreeCursor::uncouple_all_cursors(m_context, old_page, pivot); + /* internal page: fix the ptr_down of the new page + * (it must point to the ptr of the pivot key) */ + else + new_node->set_ptr_down(old_node->get_record_id(m_context, pivot)); + + /* now move some of the key/rid-tuples to the new page */ + old_node->split(m_context, new_node, pivot); + + // if the new key is >= the pivot key then continue with the right page, + // otherwise continue with the left page + to_return = m_btree->compare_keys((ham_key_t *)key, &pivot_key) >= 0 + ? new_page + : old_page; + } + + /* update the parent page */ + BtreeNodeProxy *parent_node = m_btree->get_node_from_page(parent); + uint64_t rid = new_page->get_address(); + ham_record_t record = ham_make_record(&rid, sizeof(rid)); + ham_status_t st = insert_in_page(parent, &pivot_key, &record, hints); + if (st) + throw Exception(st); + /* new root page? then also set ptr_down! 
*/ + if (parent_node->get_count() == 0) + parent_node->set_ptr_down(old_page->get_address()); + + /* fix the double-linked list of pages, and mark the pages as dirty */ + if (old_node->get_right()) { + Page *sib_page = env->page_manager()->fetch(m_context, + old_node->get_right()); + BtreeNodeProxy *sib_node = m_btree->get_node_from_page(sib_page); + sib_node->set_left(new_page->get_address()); + sib_page->set_dirty(true); + } + new_node->set_left(old_page->get_address()); + new_node->set_right(old_node->get_right()); + old_node->set_right(new_page->get_address()); + new_page->set_dirty(true); + old_page->set_dirty(true); + + BtreeIndex::ms_btree_smo_split++; + + if (g_BTREE_INSERT_SPLIT_HOOK) + g_BTREE_INSERT_SPLIT_HOOK(); + + return (to_return); +} + +Page * +BtreeUpdateAction::allocate_new_root(Page *old_root) +{ + LocalDatabase *db = m_btree->get_db(); + LocalEnvironment *env = db->lenv(); + + Page *new_root = env->page_manager()->alloc(m_context, Page::kTypeBroot); + + /* insert the pivot element and set ptr_down */ + BtreeNodeProxy *new_node = m_btree->get_node_from_page(new_root); + new_node->set_ptr_down(old_root->get_address()); + + m_btree->set_root_address(m_context, new_root->get_address()); + Page *header = env->page_manager()->fetch(m_context, 0); + header->set_dirty(true); + + old_root->set_type(Page::kTypeBindex); + + return (new_root); +} + +int +BtreeUpdateAction::get_pivot(BtreeNodeProxy *old_node, const ham_key_t *key, + BtreeStatistics::InsertHints &hints) const +{ + uint32_t old_count = old_node->get_count(); + ham_assert(old_count > 2); + + bool pivot_at_end = false; + if (hints.flags & HAM_HINT_APPEND && hints.append_count > 5) + pivot_at_end = true; + else if (old_node->get_right() == 0) { + int cmp = old_node->compare(m_context, key, old_node->get_count() - 1); + if (cmp > 0) + pivot_at_end = true; + } + + /* The position of the pivot key depends on the previous inserts; if most + * of them were appends then pick a pivot key at the "end" 
of the node */ + int pivot; + if (pivot_at_end || hints.append_count > 30) + pivot = old_count - 2; + else if (hints.append_count > 10) + pivot = (int)(old_count / 100.f * 66); + else if (hints.prepend_count > 10) + pivot = (int)(old_count / 100.f * 33); + else if (hints.prepend_count > 30) + pivot = 2; + else + pivot = old_count / 2; + + ham_assert(pivot > 0 && pivot <= (int)old_count - 2); + + return (pivot); +} + +ham_status_t +BtreeUpdateAction::insert_in_page(Page *page, ham_key_t *key, + ham_record_t *record, + BtreeStatistics::InsertHints &hints, + bool force_prepend, bool force_append) +{ + bool exists = false; + + BtreeNodeProxy *node = m_btree->get_node_from_page(page); + + int flags = 0; + if (force_prepend) + flags |= PBtreeNode::kInsertPrepend; + if (force_append) + flags |= PBtreeNode::kInsertAppend; + + PBtreeNode::InsertResult result = node->insert(m_context, key, flags); + switch (result.status) { + case HAM_DUPLICATE_KEY: + if (hints.flags & HAM_OVERWRITE) { + /* key already exists; only overwrite the data */ + if (!node->is_leaf()) + return (HAM_SUCCESS); + } + else if (!(hints.flags & HAM_DUPLICATE)) + return (HAM_DUPLICATE_KEY); + /* do NOT shift keys up to make room; just overwrite the + * current [slot] */ + exists = true; + break; + case HAM_SUCCESS: + break; + default: + return (result.status); + } + + uint32_t new_duplicate_id = 0; + if (exists) { + if (node->is_leaf()) { + // overwrite record blob + node->set_record(m_context, result.slot, record, m_duplicate_index, + hints.flags, &new_duplicate_id); + + hints.processed_leaf_page = page; + hints.processed_slot = result.slot; + } + else { + // overwrite record id + ham_assert(record->size == sizeof(uint64_t)); + node->set_record_id(m_context, result.slot, *(uint64_t *)record->data); + } + } + // key does not exist and has to be inserted or appended + else { + try { + if (node->is_leaf()) { + // allocate record id + node->set_record(m_context, result.slot, record, m_duplicate_index, + 
hints.flags, &new_duplicate_id); + + hints.processed_leaf_page = page; + hints.processed_slot = result.slot; + } + else { + // set the internal record id + ham_assert(record->size == sizeof(uint64_t)); + node->set_record_id(m_context, result.slot, *(uint64_t *)record->data); + } + } + // In case of an error: undo the insert. This happens very rarely but + // it's possible, i.e. if the BlobManager fails to allocate storage. + catch (Exception &ex) { + if (result.slot < (int)node->get_count()) + node->erase(m_context, result.slot); + throw ex; + } + } + + page->set_dirty(true); + + // if this update was triggered with a cursor (and this is a leaf node): + // couple it to the inserted key + // TODO only when performing an insert(), not an erase()! + if (m_cursor && node->is_leaf()) { + m_cursor->get_parent()->set_to_nil(Cursor::kBtree); + ham_assert(m_cursor->get_state() == BtreeCursor::kStateNil); + m_cursor->couple_to_page(page, result.slot, new_duplicate_id); + } + + return (HAM_SUCCESS); +} + +} // namespace hamsterdb diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_update.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_update.h new file mode 100644 index 0000000000..51176980fe --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_update.h @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * @exception_safe: nothrow + * @thread_safe: no + */ + +#ifndef HAM_BTREE_UPDATE_H +#define HAM_BTREE_UPDATE_H + +#include "0root/root.h" + +#include <string.h> + +// Always verify that a file of level N does not include headers > N! + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +struct Context; +class BtreeIndex; +class BtreeCursor; + +/* + * Base class for updates; derived for erasing and inserting keys. + */ +class BtreeUpdateAction +{ + public: + // Constructor + BtreeUpdateAction(BtreeIndex *btree, Context *context, BtreeCursor *cursor, + uint32_t duplicate_index) + : m_btree(btree), m_context(context), m_cursor(cursor), + m_duplicate_index(duplicate_index) { + } + + // Traverses the tree, looking for the leaf with the specified |key|. Will + // split or merge nodes while descending. + // Returns the leaf page and the |parent| of the leaf (can be null if + // there is no parent). + Page *traverse_tree(const ham_key_t *key, + BtreeStatistics::InsertHints &hints, Page **parent); + + // Calculates the pivot index of a split. + // + // For databases with sequential access (this includes recno databases): + // do not split in the middle, but at the very end of the page. + // + // If this page is the right-most page in the index, and the new key is + // inserted at the very end, then we select the same pivot as for + // sequential access. + int get_pivot(BtreeNodeProxy *old_node, const ham_key_t *key, + BtreeStatistics::InsertHints &hints) const; + + // Splits |page| and updates the |parent|. If |parent| is null then + // it's assumed that |page| is the root node. + // Returns the new page in the path for |key|; caller can immediately + // continue the traversal. 
+ Page *split_page(Page *old_page, Page *parent, const ham_key_t *key, + BtreeStatistics::InsertHints &hints); + + // Allocates a new root page and sets it up in the btree + Page *allocate_new_root(Page *old_root); + + // Inserts a key in a page + ham_status_t insert_in_page(Page *page, ham_key_t *key, + ham_record_t *record, + BtreeStatistics::InsertHints &hints, + bool force_prepend = false, bool force_append = false); + + protected: + // the current btree + BtreeIndex *m_btree; + + // The caller's Context + Context *m_context; + + // the current cursor + BtreeCursor *m_cursor; + + // the duplicate index (in case the update is for a duplicate key) + // 1-based (if 0 then this update is not for a duplicate) + uint32_t m_duplicate_index; + + private: + /* Merges the |sibling| into |page|, returns the merged page and moves + * the sibling to the freelist */ + Page *merge_page(Page *page, Page *sibling); + + /* collapse the root node; returns the new root */ + Page *collapse_root(Page *root_page); +}; + +} // namespace hamsterdb + +#endif // HAM_BTREE_UPDATE_H diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_visit.cc b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_visit.cc new file mode 100644 index 0000000000..05cd2603e5 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_visit.cc @@ -0,0 +1,117 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * btree enumeration; visits each node + */ + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "3page_manager/page_manager.h" +#include "3btree/btree_index.h" +#include "3btree/btree_node_proxy.h" +#include "3btree/btree_visitor.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class BtreeVisitAction +{ + public: + BtreeVisitAction(BtreeIndex *btree, Context *context, BtreeVisitor &visitor, + bool visit_internal_nodes) + : m_btree(btree), m_context(context), m_visitor(visitor), + m_visit_internal_nodes(visit_internal_nodes) { + ham_assert(m_btree->get_root_address() != 0); + } + + void run() { + LocalDatabase *db = m_btree->get_db(); + LocalEnvironment *env = db->lenv(); + + uint32_t pm_flags = 0; + if (m_visitor.is_read_only()) + pm_flags = PageManager::kReadOnly; + + // get the root page of the tree + Page *page = env->page_manager()->fetch(m_context, + m_btree->get_root_address(), pm_flags); + + // go down to the leaf + while (page) { + BtreeNodeProxy *node = m_btree->get_node_from_page(page); + uint64_t ptr_down = node->get_ptr_down(); + + // visit internal nodes as well? 
+ if (ptr_down != 0 && m_visit_internal_nodes) { + while (page) { + node = m_btree->get_node_from_page(page); + m_visitor(m_context, node); + + // load the right sibling + uint64_t right = node->get_right(); + if (right) + page = env->page_manager()->fetch(m_context, right, pm_flags); + else + page = 0; + } + } + + // follow the pointer to the smallest child + if (ptr_down) + page = env->page_manager()->fetch(m_context, ptr_down, pm_flags); + else + break; + } + + ham_assert(page != 0); + + // now visit all leaf nodes + while (page) { + BtreeNodeProxy *node = m_btree->get_node_from_page(page); + uint64_t right = node->get_right(); + + m_visitor(m_context, node); + + /* follow the pointer to the right sibling */ + if (right) + page = env->page_manager()->fetch(m_context, right, pm_flags); + else + break; + } + } + + private: + BtreeIndex *m_btree; + Context *m_context; + BtreeVisitor &m_visitor; + bool m_visit_internal_nodes; +}; + +void +BtreeIndex::visit_nodes(Context *context, BtreeVisitor &visitor, + bool visit_internal_nodes) +{ + BtreeVisitAction bva(this, context, visitor, visit_internal_nodes); + bva.run(); +} + +} // namespace hamsterdb + diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_visitor.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_visitor.h new file mode 100644 index 0000000000..19770a9e70 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/btree_visitor.h @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @exception_safe: nothrow + * @thread_safe: yes + */ + +#ifndef HAM_BTREE_VISITOR_H +#define HAM_BTREE_VISITOR_H + +#include "0root/root.h" + +#include "ham/hamsterdb_ola.h" + +// Always verify that a file of level N does not include headers > N! + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +// +// The ScanVisitor is the callback implementation for the scan call. +// It will either receive single keys or multiple keys in an array. +// +struct ScanVisitor { + // Operates on a single key + virtual void operator()(const void *key_data, uint16_t key_size, + size_t duplicate_count) = 0; + + // Operates on an array of keys + virtual void operator()(const void *key_array, size_t key_count) = 0; + + // Assigns the internal result to |result| + virtual void assign_result(hola_result_t *result) = 0; +}; + +struct Context; +class BtreeNodeProxy; + +// +// The BtreeVisitor is the callback implementation for the visit call. +// It will visit each node instead of each key. +// +struct BtreeVisitor { + // Specifies if the visitor modifies the node + virtual bool is_read_only() const = 0; + + // called for each node + virtual void operator()(Context *context, BtreeNodeProxy *node) = 0; +}; + +} // namespace hamsterdb + +#endif /* HAM_BTREE_VISITOR_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3btree/upfront_index.h b/plugins/Dbx_kv/src/hamsterdb/src/3btree/upfront_index.h new file mode 100644 index 0000000000..b8aad1396d --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3btree/upfront_index.h @@ -0,0 +1,684 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * A small index which manages variable length buffers. Used to manage + * variable length keys or records. + * + * The UpfrontIndex manages a range of bytes, organized in variable length + * |chunks|, assigned at initialization time when calling |allocate()| + * or |open()|. + * + * These chunks are organized in |slots|, each slot stores the offset and + * the size of the chunk data. The offset is stored as 16- or 32-bit, depending + * on the page size. The size is always a 16bit integer. + * + * The number of used slots is not stored in the UpfrontIndex, since it is + * already managed in the caller (this is equal to |PBtreeNode::get_count()|). + * Therefore you will see a lot of methods receiving a |node_count| parameter. + * + * Deleted chunks are moved to a |freelist|, which is simply a list of slots + * directly following those slots that are in use. + * + * In addition, the UpfrontIndex keeps track of the unused space at the end + * of the range (via |get_next_offset()|), in order to allow a fast + * allocation of space. 
+ * + * The UpfrontIndex stores metadata at the beginning: + * [0..3] freelist count + * [4..7] next offset + * [8..11] capacity + * + * Data is stored in the following layout: + * |metadata|slot1|slot2|...|slotN|free1|free2|...|freeM|data1|data2|...|dataN| + * + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_BTREE_UPFRONT_INDEX_H +#define HAM_BTREE_UPFRONT_INDEX_H + +#include "0root/root.h" + +#include <algorithm> +#include <vector> + +// Always verify that a file of level N does not include headers > N! +#include "1globals/globals.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +namespace DefLayout { + +/* + * A helper class to sort ranges; used during validation of the up-front + * index in check_index_integrity() + */ +struct SortHelper { + uint32_t offset; + int slot; + + bool operator<(const SortHelper &rhs) const { + return (offset < rhs.offset); + } +}; + +static bool +sort_by_offset(const SortHelper &lhs, const SortHelper &rhs) { + return (lhs.offset < rhs.offset); +} + +class UpfrontIndex +{ + enum { + // width of the 'size' field + kSizeofSize = 1 // 1 byte - max chunk size is 255 + }; + + public: + enum { + // for freelist_count, next_offset, capacity + kPayloadOffset = 12, + + // minimum capacity of the index + kMinimumCapacity = 16 + }; + + // Constructor; creates an empty index which needs to be initialized + // with |create()| or |open()|. + UpfrontIndex(LocalDatabase *db) + : m_data(0), m_range_size(0), m_vacuumize_counter(0) { + size_t page_size = db->lenv()->config().page_size_bytes; + if (page_size <= 64 * 1024) + m_sizeof_offset = 2; + else + m_sizeof_offset = 4; + } + + // Initialization routine; sets data pointer, range size and the + // initial capacity. + void create(uint8_t *data, size_t range_size, size_t capacity) { + m_data = data; + m_range_size = range_size; + set_capacity(capacity); + clear(); + } + + // "Opens" an existing index from memory. 
This method sets the data + // pointer and initializes itself. + void open(uint8_t *data, size_t range_size) { + m_data = data; + m_range_size = range_size; + // the vacuumize-counter is not persisted, therefore + // pretend that the counter is very high; in worst case this will cause + // an invalid call to vacuumize(), which is not a problem + if (get_freelist_count()) + m_vacuumize_counter = m_range_size; + } + + // Changes the range size and capacity of the index; used to resize the + // KeyList or RecordList + void change_range_size(size_t node_count, uint8_t *new_data_ptr, + size_t new_range_size, size_t new_capacity) { + if (!new_data_ptr) + new_data_ptr = m_data; + if (!new_range_size) + new_range_size = m_range_size; + + // get rid of the freelist and collect the garbage + if (get_freelist_count() > 0) + vacuumize(node_count); + ham_assert(get_freelist_count() == 0); + + size_t used_data_size = get_next_offset(node_count); + size_t old_capacity = get_capacity(); + uint8_t *src = &m_data[kPayloadOffset + + old_capacity * get_full_index_size()]; + uint8_t *dst = &new_data_ptr[kPayloadOffset + + new_capacity * get_full_index_size()]; + + // if old range == new range then leave + if (m_range_size == new_range_size + && old_capacity == new_capacity + && m_data == new_data_ptr ) + return; + + ham_assert(dst - new_data_ptr + used_data_size <= new_range_size); + + // shift "to the right"? 
Then first move the data and afterwards + // the index + if (dst > src) { + memmove(dst, src, used_data_size); + memmove(new_data_ptr, m_data, + kPayloadOffset + new_capacity * get_full_index_size()); + } + // vice versa otherwise + else if (dst <= src) { + if (new_data_ptr != m_data) + memmove(new_data_ptr, m_data, + kPayloadOffset + new_capacity * get_full_index_size()); + memmove(dst, src, used_data_size); + } + + m_data = new_data_ptr; + m_range_size = new_range_size; + set_capacity(new_capacity); + set_freelist_count(0); + set_next_offset(used_data_size); // has dependency to get_freelist_count() + } + + // Calculates the required size for a range + size_t get_required_range_size(size_t node_count) const { + return (UpfrontIndex::kPayloadOffset + + get_capacity() * get_full_index_size() + + get_next_offset(node_count)); + } + + // Returns the size of a single index entry + size_t get_full_index_size() const { + return (m_sizeof_offset + kSizeofSize); + } + + // Transforms a relative offset of the payload data to an absolute offset + // in |m_data| + uint32_t get_absolute_offset(uint32_t offset) const { + return (offset + + kPayloadOffset + + get_capacity() * get_full_index_size()); + } + + // Returns the absolute start offset of a chunk + uint32_t get_absolute_chunk_offset(int slot) const { + return (get_absolute_offset(get_chunk_offset(slot))); + } + + // Returns the relative start offset of a chunk + uint32_t get_chunk_offset(int slot) const { + uint8_t *p = &m_data[kPayloadOffset + get_full_index_size() * slot]; + if (m_sizeof_offset == 2) + return (*(uint16_t *)p); + else { + ham_assert(m_sizeof_offset == 4); + return (*(uint32_t *)p); + } + } + + // Returns the size of a chunk + uint16_t get_chunk_size(int slot) const { + return (m_data[kPayloadOffset + get_full_index_size() * slot + + m_sizeof_offset]); + } + + // Sets the size of a chunk (does NOT actually resize the chunk!) 
+ void set_chunk_size(int slot, uint16_t size) { + ham_assert(size <= 255); + m_data[kPayloadOffset + get_full_index_size() * slot + m_sizeof_offset] + = (uint8_t)size; + } + + // Increases the "vacuumize-counter", which is an indicator whether + // rearranging the node makes sense + void increase_vacuumize_counter(size_t gap_size) { + m_vacuumize_counter += gap_size; + } + + // Vacuumizes the index, *if it makes sense*. Returns true if the + // operation was successful, otherwise false + bool maybe_vacuumize(size_t node_count) { + if (m_vacuumize_counter > 0 || get_freelist_count() > 0) { + vacuumize(node_count); + return (true); + } + return (false); + } + + // Returns true if this index has at least one free slot available. + // |node_count| is the number of used slots (this is managed by the caller) + bool can_insert(size_t node_count) { + return (likely(node_count + get_freelist_count() < get_capacity())); + } + + // Inserts a slot at the position |slot|. |node_count| is the number of + // used slots (this is managed by the caller) + void insert(size_t node_count, int slot) { + ham_assert(can_insert(node_count) == true); + + size_t slot_size = get_full_index_size(); + size_t total_count = node_count + get_freelist_count(); + uint8_t *p = &m_data[kPayloadOffset + slot_size * slot]; + if (total_count > 0 && slot < (int)total_count) { + // create a gap in the index + memmove(p + slot_size, p, slot_size * (total_count - slot)); + } + + // now fill the gap + memset(p, 0, slot_size); + } + + // Erases a slot at the position |slot| + // |node_count| is the number of used slots (this is managed by the caller) + void erase(size_t node_count, int slot) { + size_t slot_size = get_full_index_size(); + size_t total_count = node_count + get_freelist_count(); + + ham_assert(slot < (int)total_count); + + set_freelist_count(get_freelist_count() + 1); + + size_t chunk_size = get_chunk_size(slot); + + increase_vacuumize_counter(chunk_size); + + // nothing to do if we delete the 
very last (used) slot; the freelist + // counter was already incremented, the used counter is decremented + // by the caller + if (slot == (int)node_count - 1) + return; + + size_t chunk_offset = get_chunk_offset(slot); + + // shift all items to the left + uint8_t *p = &m_data[kPayloadOffset + slot_size * slot]; + memmove(p, p + slot_size, slot_size * (total_count - slot)); + + // then copy the deleted chunk to the freelist + set_chunk_offset(total_count - 1, chunk_offset); + set_chunk_size(total_count - 1, chunk_size); + } + + // Adds a chunk to the freelist. Will not do anything if the node + // is already full. + void add_to_freelist(size_t node_count, uint32_t chunk_offset, + uint32_t chunk_size) { + size_t total_count = node_count + get_freelist_count(); + if (likely(total_count < get_capacity())) { + set_freelist_count(get_freelist_count() + 1); + set_chunk_size(total_count, chunk_size); + set_chunk_offset(total_count, chunk_offset); + } + } + + // Returns true if this page has enough space to store at least |num_bytes| + // bytes. 
+ bool can_allocate_space(size_t node_count, size_t num_bytes) { + // first check if we can append the data; this is the cheapest check, + // therefore it comes first + if (get_next_offset(node_count) + num_bytes <= get_usable_data_size()) + return (true); + + // otherwise check the freelist + uint32_t total_count = node_count + get_freelist_count(); + for (uint32_t i = node_count; i < total_count; i++) + if (get_chunk_size(i) >= num_bytes) + return (true); + return (false); + } + + // Allocates space for a |slot| and returns the offset of that chunk + uint32_t allocate_space(size_t node_count, int slot, + size_t num_bytes) { + ham_assert(can_allocate_space(node_count, num_bytes)); + + size_t next_offset = get_next_offset(node_count); + + // try to allocate space at the end of the node + if (next_offset + num_bytes <= get_usable_data_size()) { + uint32_t offset = get_chunk_offset(slot); + // if this slot's data is at the very end then maybe it can be + // resized without actually moving the data + if (unlikely(next_offset == offset + get_chunk_size(slot))) { + set_next_offset(offset + num_bytes); + set_chunk_size(slot, num_bytes); + return (offset); + } + set_next_offset(next_offset + num_bytes); + set_chunk_offset(slot, next_offset); + set_chunk_size(slot, num_bytes); + return (next_offset); + } + + size_t slot_size = get_full_index_size(); + + // otherwise check the freelist + uint32_t total_count = node_count + get_freelist_count(); + for (uint32_t i = node_count; i < total_count; i++) { + uint32_t chunk_size = get_chunk_size(i); + uint32_t chunk_offset = get_chunk_offset(i); + if (chunk_size >= num_bytes) { + // update next_offset? 
+ if (unlikely(next_offset == chunk_offset + chunk_size)) + invalidate_next_offset(); + else if (unlikely(next_offset == get_chunk_offset(slot) + + get_chunk_size(slot))) + invalidate_next_offset(); + // copy the chunk to the new slot + set_chunk_size(slot, num_bytes); + set_chunk_offset(slot, chunk_offset); + // remove from the freelist + if (i < total_count - 1) { + uint8_t *p = &m_data[kPayloadOffset + slot_size * i]; + memmove(p, p + slot_size, slot_size * (total_count - i - 1)); + } + set_freelist_count(get_freelist_count() - 1); + return (get_chunk_offset(slot)); + } + } + + ham_assert(!"shouldn't be here"); + throw Exception(HAM_INTERNAL_ERROR); + } + + // Returns true if |key| cannot be inserted because a split is required. + // Unlike implied by the name, this function will try to re-arrange the + // node in order for the key to fit in. + bool requires_split(size_t node_count, size_t required_size) { + return (!can_insert(node_count) + || !can_allocate_space(node_count, required_size)); + } + + // Verifies that there are no overlapping chunks + void check_integrity(size_t node_count) const { + typedef std::pair<uint32_t, uint32_t> Range; + //typedef std::vector<Range> RangeVec; + uint32_t total_count = node_count + get_freelist_count(); + + ham_assert(node_count > 1 + ? 
get_next_offset(node_count) > 0 + : true); + + if (total_count > get_capacity()) { + ham_trace(("integrity violated: total count %u (%u+%u) > capacity %u", + total_count, node_count, get_freelist_count(), + get_capacity())); + throw Exception(HAM_INTEGRITY_VIOLATED); + } + + //RangeVec ranges; + //ranges.reserve(total_count); + uint32_t next_offset = 0; + for (uint32_t i = 0; i < total_count; i++) { + Range range = std::make_pair(get_chunk_offset(i), get_chunk_size(i)); + uint32_t next = range.first + range.second; + if (next >= next_offset) + next_offset = next; + //ranges.push_back(range); + } + +#if 0 + std::sort(ranges.begin(), ranges.end()); + + if (!ranges.empty()) { + for (uint32_t i = 0; i < ranges.size() - 1; i++) { + if (ranges[i].first + ranges[i].second > ranges[i + 1].first) { + ham_trace(("integrity violated: slot %u/%u overlaps with %lu", + ranges[i].first, ranges[i].second, + ranges[i + 1].first)); + throw Exception(HAM_INTEGRITY_VIOLATED); + } + } + } +#endif + + if (next_offset != get_next_offset(node_count)) { + ham_trace(("integrity violated: next offset %d, cached offset %d", + next_offset, get_next_offset(node_count))); + throw Exception(HAM_INTEGRITY_VIOLATED); + } + if (next_offset != calc_next_offset(node_count)) { + ham_trace(("integrity violated: next offset %d, calculated offset %d", + next_offset, calc_next_offset(node_count))); + throw Exception(HAM_INTEGRITY_VIOLATED); + } + } + + // Splits an index and moves all chunks starting from position |pivot| + // to the other index. + // The other index *must* be empty! 
+ void split(UpfrontIndex *other, size_t node_count, int pivot) { + other->clear(); + + // now copy key by key + for (size_t i = pivot; i < node_count; i++) { + other->insert(i - pivot, i - pivot); + uint32_t size = get_chunk_size(i); + uint32_t offset = other->allocate_space(i - pivot, i - pivot, size); + memcpy(other->get_chunk_data_by_offset(offset), + get_chunk_data_by_offset(get_chunk_offset(i)), + size); + } + + // this node has lost lots of its data - make sure that it will be + // vacuumized as soon as more data is allocated + m_vacuumize_counter += node_count; + set_freelist_count(0); + set_next_offset((uint32_t)-1); + } + + // Merges all chunks from the |other| index to this index + void merge_from(UpfrontIndex *other, size_t node_count, + size_t other_node_count) { + vacuumize(node_count); + + for (size_t i = 0; i < other_node_count; i++) { + insert(i + node_count, i + node_count); + uint32_t size = other->get_chunk_size(i); + uint32_t offset = allocate_space(i + node_count, i + node_count, size); + memcpy(get_chunk_data_by_offset(offset), + other->get_chunk_data_by_offset(other->get_chunk_offset(i)), + size); + } + + other->clear(); + } + + // Returns a pointer to the actual data of a chunk + uint8_t *get_chunk_data_by_offset(uint32_t offset) { + return (&m_data[kPayloadOffset + + get_capacity() * get_full_index_size() + + offset]); + } + + // Returns a pointer to the actual data of a chunk + uint8_t *get_chunk_data_by_offset(uint32_t offset) const { + return (&m_data[kPayloadOffset + + get_capacity() * get_full_index_size() + + offset]); + } + + // Reduces the capacity of the UpfrontIndex, if required + void reduce_capacity(size_t node_count) { + size_t old_capacity = get_capacity(); + if (node_count > 0 && old_capacity > node_count + 4) { + size_t new_capacity = old_capacity - (old_capacity - node_count) / 2; + if (new_capacity != old_capacity) + change_range_size(node_count, m_data, m_range_size, new_capacity); + } + } + + // Re-arranges the node: 
moves all keys sequentially to the beginning + // of the key space, removes the whole freelist. + // + // This call is extremely expensive! Try to avoid it as much as possible. + void vacuumize(size_t node_count) { + if (m_vacuumize_counter < 10) { + if (get_freelist_count() > 0) { + set_freelist_count(0); + invalidate_next_offset(); + } + return; + } + + // get rid of the freelist - this node is now completely rewritten, + // and the freelist would just complicate things + set_freelist_count(0); + + // make a copy of all indices (excluding the freelist) + bool requires_sort = false; + SortHelper *s = (SortHelper *)::alloca(node_count * sizeof(SortHelper)); + for (size_t i = 0; i < node_count; i++) { + s[i].slot = i; + s[i].offset = get_chunk_offset(i); + if (i > 0 && s[i].offset < s[i - 1].offset) + requires_sort = true; + } + + // sort them by offset; this is a very expensive call. only sort if + // it's absolutely necessary! + if (requires_sort) + std::sort(&s[0], &s[node_count], sort_by_offset); + + // shift all keys to the left, get rid of all gaps at the front of the + // key data or between the keys + uint32_t next_offset = 0; + uint32_t start = kPayloadOffset + get_capacity() * get_full_index_size(); + for (size_t i = 0; i < node_count; i++) { + uint32_t offset = s[i].offset; + int slot = s[i].slot; + uint32_t size = get_chunk_size(slot); + if (offset != next_offset) { + // shift key to the left + memmove(&m_data[start + next_offset], + get_chunk_data_by_offset(offset), size); + // store the new offset + set_chunk_offset(slot, next_offset); + } + next_offset += size; + } + + set_next_offset(next_offset); + m_vacuumize_counter = 0; + } + + // Invalidates the cached "next offset". In some cases it's necessary + // that the caller forces a re-evaluation of the next offset. Although + // i *think* that this method could become private, but the effort + // is not worth the gain. 
+ void invalidate_next_offset() { + set_next_offset((uint32_t)-1); + } + + // Same as above, but only if the next_offset equals |new_offset| + void maybe_invalidate_next_offset(size_t new_offset) { + if (get_next_offset(0) == new_offset) + invalidate_next_offset(); + } + + // Returns the capacity + size_t get_capacity() const { + return (*(uint32_t *)(m_data + 8)); + } + + // Returns the offset of the unused space at the end of the page + uint32_t get_next_offset(size_t node_count) { + uint32_t ret = *(uint32_t *)(m_data + 4); + if (unlikely(ret == (uint32_t)-1 && node_count > 0)) { + ret = calc_next_offset(node_count); + set_next_offset(ret); + } + return (ret); + } + + private: + friend class UpfrontIndexFixture; + + // Resets the page + void clear() { + set_freelist_count(0); + set_next_offset(0); + m_vacuumize_counter = 0; + } + + // Returns the offset of the unused space at the end of the page + // (const version) + uint32_t get_next_offset(size_t node_count) const { + uint32_t ret = *(uint32_t *)(m_data + 4); + if (unlikely(ret == (uint32_t)-1)) + return (calc_next_offset(node_count)); + return (ret); + } + + // Returns the size (in bytes) where payload data can be stored + size_t get_usable_data_size() const { + return (m_range_size - kPayloadOffset + - get_capacity() * get_full_index_size()); + } + + // Sets the chunk offset of a slot + void set_chunk_offset(int slot, uint32_t offset) { + uint8_t *p = &m_data[kPayloadOffset + get_full_index_size() * slot]; + if (m_sizeof_offset == 2) + *(uint16_t *)p = (uint16_t)offset; + else + *(uint32_t *)p = offset; + } + + // Returns the number of freelist entries + size_t get_freelist_count() const { + return (*(uint32_t *)m_data); + } + + // Sets the number of freelist entries + void set_freelist_count(size_t freelist_count) { + ham_assert(freelist_count <= get_capacity()); + *(uint32_t *)m_data = freelist_count; + } + + // Calculates and returns the next offset; does not store it + uint32_t calc_next_offset(size_t 
node_count) const { + uint32_t total_count = node_count + get_freelist_count(); + uint32_t next_offset = 0; + for (uint32_t i = 0; i < total_count; i++) { + uint32_t next = get_chunk_offset(i) + get_chunk_size(i); + if (next >= next_offset) + next_offset = next; + } + return (next_offset); + } + + // Sets the offset of the unused space at the end of the page + void set_next_offset(uint32_t next_offset) { + *(uint32_t *)(m_data + 4) = next_offset; + } + + // Sets the capacity (number of slots) + void set_capacity(size_t capacity) { + ham_assert(capacity > 0); + *(uint32_t *)(m_data + 8) = (uint32_t)capacity; + } + + // The physical data in the node + uint8_t *m_data; + + // The size of the offset; either 16 or 32 bits, depending on page size + size_t m_sizeof_offset; + + // The size of the range, in bytes + size_t m_range_size; + + // A counter to indicate when rearranging the data makes sense + int m_vacuumize_counter; +}; + +} // namespace DefLayout + +} // namespace hamsterdb + +#endif /* HAM_BTREE_UPFRONT_INDEX_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3cache/cache.h b/plugins/Dbx_kv/src/hamsterdb/src/3cache/cache.h new file mode 100644 index 0000000000..a24daf3828 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3cache/cache.h @@ -0,0 +1,244 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * The Cache Manager + * + * Stores pages in a non-intrusive hash table (each Page instance keeps + * next/previous pointers for the overflow bucket). Can efficiently purge + * unused pages, because all pages are also stored in a (non-intrusive) + * linked list, and whenever a page is accessed it is removed and re-inserted + * at the head. The tail therefore points to the page which was not used + * in a long time, and is the primary candidate for purging. + * + * @exception_safe: nothrow + * @thread_safe: yes + */ + +#ifndef HAM_CACHE_H +#define HAM_CACHE_H + +#include "0root/root.h" + +#include <vector> + +#include "ham/hamsterdb_int.h" + +// Always verify that a file of level N does not include headers > N! +#include "2page/page.h" +#include "2page/page_collection.h" +#include "2config/env_config.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class Cache +{ + enum { + // The number of buckets should be a prime number or similar, as it + // is used in a MODULO hash scheme + kBucketSize = 10317, + }; + + template<typename Purger> + struct PurgeIfSelector + { + PurgeIfSelector(Cache *cache, Purger &purger) + : m_cache(cache), m_purger(purger) { + } + + bool operator()(Page *page) { + if (m_purger(page)) { + m_cache->del(page); + delete page; + } + // don't remove page from list; it was already removed above + return (false); + } + + Cache *m_cache; + Purger &m_purger; + }; + + public: + // The default constructor + Cache(const EnvironmentConfiguration &config) + : m_capacity_bytes(config.flags & HAM_CACHE_UNLIMITED + ? 
0xffffffffffffffffull + : config.cache_size_bytes), + m_page_size_bytes(config.page_size_bytes), + m_alloc_elements(0), m_totallist(Page::kListCache), + m_buckets(kBucketSize, PageCollection(Page::kListBucket)), + m_cache_hits(0), m_cache_misses(0) { + ham_assert(m_capacity_bytes > 0); + } + + // Fills in the current metrics + void fill_metrics(ham_env_metrics_t *metrics) const { + metrics->cache_hits = m_cache_hits; + metrics->cache_misses = m_cache_misses; + } + + // Retrieves a page from the cache, also removes the page from the cache + // and re-inserts it at the front. Returns null if the page was not cached. + Page *get(uint64_t address) { + size_t hash = calc_hash(address); + + Page *page = m_buckets[hash].get(address);; + if (!page) { + m_cache_misses++; + return (0); + } + + // Now re-insert the page at the head of the "totallist", and + // thus move far away from the tail. The pages at the tail are highest + // candidates to be deleted when the cache is purged. + m_totallist.del(page); + m_totallist.put(page); + m_cache_hits++; + return (page); + } + + // Stores a page in the cache + void put(Page *page) { + size_t hash = calc_hash(page->get_address()); + ham_assert(page->get_data()); + + /* First remove the page from the cache, if it's already cached + * + * Then re-insert the page at the head of the list. The tail will + * point to the least recently used page. + */ + m_totallist.del(page); + m_totallist.put(page); + + if (page->is_allocated()) + m_alloc_elements++; + m_buckets[hash].put(page); + } + + // Removes a page from the cache + void del(Page *page) { + ham_assert(page->get_address() != 0); + size_t hash = calc_hash(page->get_address()); + /* remove the page from the cache buckets */ + m_buckets[hash].del(page); + + /* remove it from the list of all cached pages */ + if (m_totallist.del(page) && page->is_allocated()) + m_alloc_elements--; + } + + // Purges the cache. Implements a LRU eviction algorithm. 
Dirty pages are + // forwarded to the |processor()| for flushing. + // + // Tries to purge at least 20 pages. In benchmarks this has proven to + // be a good limit. + template<typename Processor> + void purge(Processor &processor, Page *ignore_page) { + int limit = int(current_elements() + - (m_capacity_bytes / m_page_size_bytes)); + + Page *page = m_totallist.tail(); + for (int i = 0; i < limit && page != 0; i++) { + Page *next = page->get_previous(Page::kListCache); + + // dirty pages are flushed by the worker thread + if (page->is_dirty()) { + processor(page); + page = next; + continue; + } + // non-dirty pages are deleted if possible + if (!page->is_dirty() + && page->cursor_list() == 0 + && page != ignore_page + && page->mutex().try_lock()) { + del(page); + page->mutex().unlock(); + delete page; + } + + page = next; + } + } + + // Visits all pages in the "totallist". If |cb| returns true then the + // page is removed and deleted. This is used by the Environment + // to flush (and delete) pages. 
+ template<typename Purger> + void purge_if(Purger &purger) { + PurgeIfSelector<Purger> selector(this, purger); + m_totallist.extract(selector); + } + + // Returns true if the capacity limits are exceeded + bool is_cache_full() const { + return (current_elements() * m_page_size_bytes + > m_capacity_bytes); + } + + // Returns the capacity (in bytes) + uint64_t capacity() const { + return (m_capacity_bytes); + } + + // Returns the number of currently cached elements + size_t current_elements() const { + return (m_totallist.size()); + } + + // Returns the number of currently cached elements (excluding those that + // are mmapped) + size_t allocated_elements() const { + return (m_alloc_elements); + } + + private: + // Calculates the hash of a page address + size_t calc_hash(uint64_t value) const { + return ((size_t)(value % Cache::kBucketSize)); + } + + // the capacity (in bytes) + uint64_t m_capacity_bytes; + + // the current page size (in bytes) + uint64_t m_page_size_bytes; + + // the current number of cached elements that were allocated (and not + // mapped) + size_t m_alloc_elements; + + // linked list of ALL cached pages + PageCollection m_totallist; + + // The hash table buckets - each is a linked list of Page pointers + std::vector<PageCollection> m_buckets; + + // counts the cache hits + uint64_t m_cache_hits; + + // counts the cache misses + uint64_t m_cache_misses; +}; + +} // namespace hamsterdb + +#endif /* HAM_CACHE_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3changeset/changeset.cc b/plugins/Dbx_kv/src/hamsterdb/src/3changeset/changeset.cc new file mode 100644 index 0000000000..2e5ace06f5 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3changeset/changeset.cc @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Always verify that a file of level N does not include headers > N! +#include "1errorinducer/errorinducer.h" +#include "2device/device.h" +#include "2page/page.h" +#include "3changeset/changeset.h" +#include "3journal/journal.h" +#include "3page_manager/page_manager.h" +#include "4db/db.h" +#include "4env/env_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +/* a unittest hook for Changeset::flush() */ +void (*g_CHANGESET_POST_LOG_HOOK)(void); + +struct PageCollectionVisitor +{ + PageCollectionVisitor(Page **pages) + : num_pages(0), pages(pages) { + } + + void prepare(size_t size) { + } + + bool operator()(Page *page) { + if (page->is_dirty() == true) { + pages[num_pages] = page; + ++num_pages; + } + // |page| is now removed from the Changeset + page->mutex().unlock(); + return (true); + } + + int num_pages; + Page **pages; +}; + +void +Changeset::flush(uint64_t lsn) +{ + // now flush all modified pages to disk + if (m_collection.is_empty()) + return; + + HAM_INDUCE_ERROR(ErrorInducer::kChangesetFlush); + + // Fetch the pages, ignoring all pages that are not dirty + Page **pages = (Page **)::alloca(sizeof(Page *) * m_collection.size()); + PageCollectionVisitor visitor(pages); + m_collection.extract(visitor); + + // TODO sort by address (really?) + + if (visitor.num_pages == 0) + return; + + // If only one page is modified then the modification is atomic. The page + // is written to the btree (no log required). 
+ // + // If more than one page is modified then the modification is no longer + // atomic. All dirty pages are written to the log. + if (visitor.num_pages > 1) { + m_env->journal()->append_changeset((const Page **)visitor.pages, + visitor.num_pages, lsn); + } + + HAM_INDUCE_ERROR(ErrorInducer::kChangesetFlush); + + /* execute a post-log hook; this hook is set by the unittest framework + * and can be used to make a backup copy of the logfile */ + if (g_CHANGESET_POST_LOG_HOOK) + g_CHANGESET_POST_LOG_HOOK(); + + /* now write all the pages to the file; if any of these writes fail, + * we can still recover from the log */ + for (int i = 0; i < visitor.num_pages; i++) { + Page *p = visitor.pages[i]; + if (p->is_without_header() == false) + p->set_lsn(lsn); + p->flush(); + + HAM_INDUCE_ERROR(ErrorInducer::kChangesetFlush); + } + + /* flush the file handle (if required) */ + if (m_env->get_flags() & HAM_ENABLE_FSYNC) + m_env->device()->flush(); + + HAM_INDUCE_ERROR(ErrorInducer::kChangesetFlush); +} + +} // namespace hamsterdb diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3changeset/changeset.h b/plugins/Dbx_kv/src/hamsterdb/src/3changeset/changeset.h new file mode 100644 index 0000000000..a21c6f45f9 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3changeset/changeset.h @@ -0,0 +1,118 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * A changeset collects all pages that are modified during a single + * operation. + * + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_CHANGESET_H +#define HAM_CHANGESET_H + +#include "0root/root.h" + +#include <stdlib.h> + +// Always verify that a file of level N does not include headers > N! +#include "2page/page.h" +#include "2page/page_collection.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class LocalEnvironment; + +class Changeset +{ + struct UnlockPage + { + bool operator()(Page *page) { + #ifdef HAM_ENABLE_HELGRIND + page->mutex().try_lock(); + #endif + page->mutex().unlock(); + return (true); + } + }; + + public: + Changeset(LocalEnvironment *env) + : m_env(env), m_collection(Page::kListChangeset) { + } + + /* + * Returns a page from the changeset, or NULL if the page is not part + * of the changeset + */ + Page *get(uint64_t address) { + return (m_collection.get(address)); + } + + /* Append a new page to the changeset. The page is locked. */ + void put(Page *page) { + if (!has(page)) { + page->mutex().lock(); + } + m_collection.put(page); + } + + /* Removes a page from the changeset. The page is unlocked. */ + void del(Page *page) { + page->mutex().unlock(); + m_collection.del(page); + } + + /* Check if the page is already part of the changeset */ + bool has(Page *page) const { + return (m_collection.has(page)); + } + + /* Returns true if the changeset is empty */ + bool is_empty() const { + return (m_collection.is_empty()); + } + + /* Removes all pages from the changeset. The pages are unlocked. */ + void clear() { + UnlockPage unlocker; + m_collection.for_each(unlocker); + m_collection.clear(); + } + + /* + * Flush all pages in the changeset - first write them to the log, then + * write them to the disk. 
+ * On success: will clear the changeset and the journal + */ + void flush(uint64_t lsn); + + private: + /* The Environment */ + LocalEnvironment *m_env; + + /* The pages which were added to this Changeset */ + PageCollection m_collection; +}; + +} // namespace hamsterdb + +#endif /* HAM_CHANGESET_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3journal/journal.cc b/plugins/Dbx_kv/src/hamsterdb/src/3journal/journal.cc new file mode 100644 index 0000000000..50e749240f --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3journal/journal.cc @@ -0,0 +1,862 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +#include <string.h> +#ifndef HAM_OS_WIN32 +# include <libgen.h> +#endif + +#include "1base/error.h" +#include "1errorinducer/errorinducer.h" +#include "1os/os.h" +#include "2device/device.h" +#include "3journal/journal.h" +#include "3page_manager/page_manager.h" +#include "4db/db.h" +#include "4txn/txn_local.h" +#include "4env/env_local.h" +#include "4context/context.h" + +// Always verify that a file of level N does not include headers > N! 
+ +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +Journal::Journal(LocalEnvironment *env) + : m_state(env) +{ +} + +void +Journal::create() +{ + // create the two files + for (int i = 0; i < 2; i++) { + std::string path = get_path(i); + m_state.files[i].create(path.c_str(), 0644); + } +} + +void +Journal::open() +{ + // open the two files + try { + std::string path = get_path(0); + m_state.files[0].open(path.c_str(), false); + path = get_path(1); + m_state.files[1].open(path.c_str(), 0); + } + catch (Exception &ex) { + m_state.files[1].close(); + m_state.files[0].close(); + throw ex; + } +} + +int +Journal::switch_files_maybe() +{ + int other = m_state.current_fd ? 0 : 1; + + // determine the journal file which is used for this transaction + // if the "current" file is not yet full, continue to write to this file + if (m_state.open_txn[m_state.current_fd] + + m_state.closed_txn[m_state.current_fd] + < m_state.threshold) + return (m_state.current_fd); + + // If the other file does no longer have open Transactions then + // delete the other file and use the other file as the current file + if (m_state.open_txn[other] == 0) { + clear_file(other); + m_state.current_fd = other; + // fall through + } + + // Otherwise just continue using the current file + return (m_state.current_fd); +} + +void +Journal::append_txn_begin(LocalTransaction *txn, const char *name, uint64_t lsn) +{ + if (m_state.disable_logging) + return; + + ham_assert((txn->get_flags() & HAM_TXN_TEMPORARY) == 0); + + PJournalEntry entry; + entry.txn_id = txn->get_id(); + entry.type = kEntryTypeTxnBegin; + entry.lsn = lsn; + if (name) + entry.followup_size = strlen(name) + 1; + + txn->set_log_desc(switch_files_maybe()); + + int cur = txn->get_log_desc(); + + if (txn->get_name().size()) + append_entry(cur, (uint8_t *)&entry, (uint32_t)sizeof(entry), + (uint8_t *)txn->get_name().c_str(), + (uint32_t)txn->get_name().size() + 1); + else + append_entry(cur, (uint8_t 
*)&entry, (uint32_t)sizeof(entry)); + maybe_flush_buffer(cur); + + m_state.open_txn[cur]++; + + // store the fp-index in the journal structure; it's needed for + // journal_append_checkpoint() to quickly find out which file is + // the newest + m_state.current_fd = cur; +} + +void +Journal::append_txn_abort(LocalTransaction *txn, uint64_t lsn) +{ + if (m_state.disable_logging) + return; + + ham_assert((txn->get_flags() & HAM_TXN_TEMPORARY) == 0); + + int idx; + PJournalEntry entry; + entry.lsn = lsn; + entry.txn_id = txn->get_id(); + entry.type = kEntryTypeTxnAbort; + + // update the transaction counters of this logfile + idx = txn->get_log_desc(); + m_state.open_txn[idx]--; + m_state.closed_txn[idx]++; + + append_entry(idx, (uint8_t *)&entry, sizeof(entry)); + maybe_flush_buffer(idx); + // no need for fsync - incomplete transactions will be aborted anyway +} + +void +Journal::append_txn_commit(LocalTransaction *txn, uint64_t lsn) +{ + if (m_state.disable_logging) + return; + + ham_assert((txn->get_flags() & HAM_TXN_TEMPORARY) == 0); + + PJournalEntry entry; + entry.lsn = lsn; + entry.txn_id = txn->get_id(); + entry.type = kEntryTypeTxnCommit; + + // do not yet update the transaction counters of this logfile; just + // because the txn was committed does not mean that it will be flushed + // immediately. The counters will be modified in transaction_flushed(). + int idx = txn->get_log_desc(); + + append_entry(idx, (uint8_t *)&entry, sizeof(entry)); + + // and flush the file + flush_buffer(idx, m_state.env->get_flags() & HAM_ENABLE_FSYNC); +} + +void +Journal::append_insert(Database *db, LocalTransaction *txn, + ham_key_t *key, ham_record_t *record, uint32_t flags, + uint64_t lsn) +{ + if (m_state.disable_logging) + return; + + PJournalEntry entry; + PJournalEntryInsert insert; + uint32_t size = sizeof(PJournalEntryInsert) + + key->size + + (flags & HAM_PARTIAL + ? 
record->partial_size + : record->size) + - 1; + + entry.lsn = lsn; + entry.dbname = db->name(); + entry.type = kEntryTypeInsert; + entry.followup_size = size; + + int idx; + if (txn->get_flags() & HAM_TXN_TEMPORARY) { + entry.txn_id = 0; + idx = switch_files_maybe(); + m_state.closed_txn[idx]++; + } + else { + entry.txn_id = txn->get_id(); + idx = txn->get_log_desc(); + } + + insert.key_size = key->size; + insert.record_size = record->size; + insert.record_partial_size = record->partial_size; + insert.record_partial_offset = record->partial_offset; + insert.insert_flags = flags; + + // append the entry to the logfile + append_entry(idx, (uint8_t *)&entry, sizeof(entry), + (uint8_t *)&insert, sizeof(PJournalEntryInsert) - 1, + (uint8_t *)key->data, key->size, + (uint8_t *)record->data, (flags & HAM_PARTIAL + ? record->partial_size + : record->size)); + maybe_flush_buffer(idx); +} + +void +Journal::append_erase(Database *db, LocalTransaction *txn, ham_key_t *key, + int duplicate_index, uint32_t flags, uint64_t lsn) +{ + if (m_state.disable_logging) + return; + + PJournalEntry entry; + PJournalEntryErase erase; + uint32_t size = sizeof(PJournalEntryErase) + key->size - 1; + + entry.lsn = lsn; + entry.dbname = db->name(); + entry.type = kEntryTypeErase; + entry.followup_size = size; + erase.key_size = key->size; + erase.erase_flags = flags; + erase.duplicate = duplicate_index; + + int idx; + if (txn->get_flags() & HAM_TXN_TEMPORARY) { + entry.txn_id = 0; + idx = switch_files_maybe(); + m_state.closed_txn[idx]++; + } + else { + entry.txn_id = txn->get_id(); + idx = txn->get_log_desc(); + } + + // append the entry to the logfile + append_entry(idx, (uint8_t *)&entry, sizeof(entry), + (uint8_t *)&erase, sizeof(PJournalEntryErase) - 1, + (uint8_t *)key->data, key->size); + maybe_flush_buffer(idx); +} + +void +Journal::append_changeset(const Page **pages, int num_pages, uint64_t lsn) +{ + if (m_state.disable_logging) + return; + + PJournalEntry entry; + 
PJournalEntryChangeset changeset; + + entry.lsn = lsn; + entry.dbname = 0; + entry.txn_id = 0; + entry.type = kEntryTypeChangeset; + // followup_size is incomplete - the actual page sizes are added later + entry.followup_size = sizeof(PJournalEntryChangeset); + changeset.num_pages = num_pages; + + // we need the current position in the file buffer. if compression is enabled + // then we do not know the actual followup-size of this entry. it will be + // patched in later. + uint32_t entry_position = m_state.buffer[m_state.current_fd].get_size(); + + // write the data to the file + append_entry(m_state.current_fd, (uint8_t *)&entry, sizeof(entry), + (uint8_t *)&changeset, sizeof(PJournalEntryChangeset)); + + size_t page_size = m_state.env->config().page_size_bytes; + for (int i = 0; i < num_pages; i++) { + entry.followup_size += append_changeset_page(pages[i], page_size); + } + + HAM_INDUCE_ERROR(ErrorInducer::kChangesetFlush); + + // and patch in the followup-size + m_state.buffer[m_state.current_fd].overwrite(entry_position, + (uint8_t *)&entry, sizeof(entry)); + + HAM_INDUCE_ERROR(ErrorInducer::kChangesetFlush); + + // and flush the file + flush_buffer(m_state.current_fd, m_state.env->get_flags() & HAM_ENABLE_FSYNC); + + HAM_INDUCE_ERROR(ErrorInducer::kChangesetFlush); + + // if recovery is enabled (w/o transactions) then simulate a "commit" to + // make sure that the log files are switched properly + m_state.closed_txn[m_state.current_fd]++; + (void)switch_files_maybe(); +} + +uint32_t +Journal::append_changeset_page(const Page *page, uint32_t page_size) +{ + PJournalEntryPageHeader header(page->get_address()); + + append_entry(m_state.current_fd, (uint8_t *)&header, sizeof(header), + page->get_raw_payload(), page_size); + return (page_size + sizeof(header)); +} + +void +Journal::transaction_flushed(LocalTransaction *txn) +{ + ham_assert((txn->get_flags() & HAM_TXN_TEMPORARY) == 0); + if (m_state.disable_logging) // ignore this call during recovery + return; + + 
int idx = txn->get_log_desc(); + ham_assert(m_state.open_txn[idx] > 0); + m_state.open_txn[idx]--; + m_state.closed_txn[idx]++; +} + +void +Journal::get_entry(Iterator *iter, PJournalEntry *entry, ByteArray *auxbuffer) +{ + uint64_t filesize; + + auxbuffer->clear(); + + // if iter->offset is 0, then the iterator was created from scratch + // and we start reading from the first (oldest) entry. + // + // The oldest of the two logfiles is always the "other" one (the one + // NOT in current_fd). + if (iter->offset == 0) { + iter->fdstart = iter->fdidx = + m_state.current_fd == 0 + ? 1 + : 0; + } + + // get the size of the journal file + filesize = m_state.files[iter->fdidx].get_file_size(); + + // reached EOF? then either skip to the next file or we're done + if (filesize == iter->offset) { + if (iter->fdstart == iter->fdidx) { + iter->fdidx = iter->fdidx == 1 ? 0 : 1; + iter->offset = 0; + filesize = m_state.files[iter->fdidx].get_file_size(); + } + else { + entry->lsn = 0; + return; + } + } + + // second file is also empty? then return + if (filesize == iter->offset) { + entry->lsn = 0; + return; + } + + // now try to read the next entry + try { + m_state.files[iter->fdidx].pread(iter->offset, entry, sizeof(*entry)); + + iter->offset += sizeof(*entry); + + // read auxiliary data if it's available + if (entry->followup_size) { + auxbuffer->resize((uint32_t)entry->followup_size); + + m_state.files[iter->fdidx].pread(iter->offset, auxbuffer->get_ptr(), + (size_t)entry->followup_size); + iter->offset += entry->followup_size; + } + } + catch (Exception &) { + ham_trace(("failed to read journal entry, aborting recovery")); + entry->lsn = 0; // this triggers the end of recovery + } +} + +void +Journal::close(bool noclear) +{ + int i; + + // the noclear flag is set during testing, for checking whether the files + // contain the correct data. 
Flush the buffers, otherwise the tests will + // fail because data is missing + if (noclear) { + flush_buffer(0); + flush_buffer(1); + } + + if (!noclear) + clear(); + + for (i = 0; i < 2; i++) { + m_state.files[i].close(); + m_state.buffer[i].clear(); + } +} + +Database * +Journal::get_db(uint16_t dbname) +{ + // first check if the Database is already open + JournalState::DatabaseMap::iterator it = m_state.database_map.find(dbname); + if (it != m_state.database_map.end()) + return (it->second); + + // not found - open it + Database *db = 0; + DatabaseConfiguration config; + config.db_name = dbname; + ham_status_t st = m_state.env->open_db(&db, config, 0); + if (st) + throw Exception(st); + m_state.database_map[dbname] = db; + return (db); +} + +Transaction * +Journal::get_txn(LocalTransactionManager *txn_manager, uint64_t txn_id) +{ + Transaction *txn = txn_manager->get_oldest_txn(); + while (txn) { + if (txn->get_id() == txn_id) + return (txn); + txn = txn->get_next(); + } + + return (0); +} + +void +Journal::close_all_databases() +{ + ham_status_t st = 0; + + JournalState::DatabaseMap::iterator it = m_state.database_map.begin(); + while (it != m_state.database_map.end()) { + JournalState::DatabaseMap::iterator it2 = it; it++; + st = ham_db_close((ham_db_t *)it2->second, HAM_DONT_LOCK); + if (st) { + ham_log(("ham_db_close() failed w/ error %d (%s)", st, ham_strerror(st))); + throw Exception(st); + } + } + m_state.database_map.clear(); +} + +void +Journal::abort_uncommitted_txns(LocalTransactionManager *txn_manager) +{ + Transaction *txn = txn_manager->get_oldest_txn(); + + while (txn) { + if (!txn->is_committed()) + txn->abort(); + txn = txn->get_next(); + } +} + +void +Journal::recover(LocalTransactionManager *txn_manager) +{ + Context context(m_state.env, 0, 0); + + // first re-apply the last changeset + uint64_t start_lsn = recover_changeset(); + + // load the state of the PageManager; the PageManager state is loaded AFTER + // physical recovery because its 
page might have been restored in + // recover_changeset() + uint64_t page_manager_blobid = m_state.env->header()->get_page_manager_blobid(); + if (page_manager_blobid != 0) { + m_state.env->page_manager()->initialize(page_manager_blobid); + } + + // then start the normal recovery + if (m_state.env->get_flags() & HAM_ENABLE_TRANSACTIONS) + recover_journal(&context, txn_manager, start_lsn); +} + +uint64_t +Journal::scan_for_newest_changeset(File *file, uint64_t *position) +{ + Iterator it; + PJournalEntry entry; + ByteArray buffer; + uint64_t result = 0; + + // get the next entry + try { + uint64_t filesize = file->get_file_size(); + + while (it.offset < filesize) { + file->pread(it.offset, &entry, sizeof(entry)); + + if (entry.lsn == 0) + break; + + if (entry.type == kEntryTypeChangeset) { + *position = it.offset; + result = entry.lsn; + } + + // increment the offset + it.offset += sizeof(entry); + if (entry.followup_size) + it.offset += entry.followup_size; + } + } + catch (Exception &ex) { + ham_log(("exception (error %d) while reading journal", ex.code)); + } + + return (result); +} + +uint64_t +Journal::recover_changeset() +{ + // scan through both files, look for the file with the newest changeset + uint64_t position0, position1, position; + uint64_t lsn1 = scan_for_newest_changeset(&m_state.files[0], &position0); + uint64_t lsn2 = scan_for_newest_changeset(&m_state.files[1], &position1); + + // both files are empty or do not contain a changeset? + if (lsn1 == 0 && lsn2 == 0) + return (0); + + // re-apply the newest changeset + m_state.current_fd = lsn1 > lsn2 ? 0 : 1; + position = lsn1 > lsn2 ? 
position0 : position1; + + PJournalEntry entry; + uint64_t start_lsn = 0; + + try { + m_state.files[m_state.current_fd].pread(position, &entry, sizeof(entry)); + position += sizeof(entry); + ham_assert(entry.type == kEntryTypeChangeset); + + // Read the Changeset header + PJournalEntryChangeset changeset; + m_state.files[m_state.current_fd].pread(position, &changeset, + sizeof(changeset)); + position += sizeof(changeset); + + uint32_t page_size = m_state.env->config().page_size_bytes; + ByteArray arena(page_size); + + uint64_t file_size = m_state.env->device()->file_size(); + + // for each page in this changeset... + for (uint32_t i = 0; i < changeset.num_pages; i++) { + PJournalEntryPageHeader page_header; + m_state.files[m_state.current_fd].pread(position, &page_header, + sizeof(page_header)); + position += sizeof(page_header); + m_state.files[m_state.current_fd].pread(position, arena.get_ptr(), + page_size); + position += page_size; + + Page *page; + + // now write the page to disk + if (page_header.address == file_size) { + file_size += page_size; + + page = new Page(m_state.env->device()); + page->alloc(0); + } + else if (page_header.address > file_size) { + file_size = (size_t)page_header.address + page_size; + m_state.env->device()->truncate(file_size); + + page = new Page(m_state.env->device()); + page->fetch(page_header.address); + } + else { + page = new Page(m_state.env->device()); + page->fetch(page_header.address); + } + + // only overwrite the page data if the page's last modification + // is OLDER than the changeset! 
+ bool skip = false; + if (page->is_without_header() == false) { + if (page->get_lsn() > entry.lsn) { + skip = true; + start_lsn = page->get_lsn(); + } + } + + if (!skip) { + // overwrite the page data + memcpy(page->get_data(), arena.get_ptr(), page_size); + + ham_assert(page->get_address() == page_header.address); + + // flush the modified page to disk + page->set_dirty(true); + page->flush(); + } + + delete page; + } + } + catch (Exception &) { + ham_trace(("Exception when applying changeset; skipping changeset")); + // fall through + } + + return (std::max(start_lsn, entry.lsn)); +} + +void +Journal::recover_journal(Context *context, + LocalTransactionManager *txn_manager, uint64_t start_lsn) +{ + ham_status_t st = 0; + Iterator it; + ByteArray buffer; + + /* recovering the journal is rather simple - we iterate over the + * files and re-apply EVERY operation (incl. txn_begin and txn_abort), + * that was not yet flushed with a Changeset. + * + * Basically we iterate over both log files and skip everything with + * a sequence number (lsn) smaller the one of the last Changeset. + * + * When done then auto-abort all transactions that were not yet + * committed. + */ + + // make sure that there are no pending transactions - start with + // a clean state! + ham_assert(txn_manager->get_oldest_txn() == 0); + ham_assert(m_state.env->get_flags() & HAM_ENABLE_TRANSACTIONS); + ham_assert(m_state.env->get_flags() & HAM_ENABLE_RECOVERY); + + // do not append to the journal during recovery + m_state.disable_logging = true; + + do { + PJournalEntry entry; + + // get the next entry + get_entry(&it, &entry, &buffer); + + // reached end of logfile? 
+ if (!entry.lsn) + break; + + // re-apply this operation + switch (entry.type) { + case kEntryTypeTxnBegin: { + Transaction *txn = 0; + st = ham_txn_begin((ham_txn_t **)&txn, (ham_env_t *)m_state.env, + (const char *)buffer.get_ptr(), 0, HAM_DONT_LOCK); + // on success: patch the txn ID + if (st == 0) { + txn->set_id(entry.txn_id); + txn_manager->set_txn_id(entry.txn_id); + } + break; + } + case kEntryTypeTxnAbort: { + Transaction *txn = get_txn(txn_manager, entry.txn_id); + st = ham_txn_abort((ham_txn_t *)txn, HAM_DONT_LOCK); + break; + } + case kEntryTypeTxnCommit: { + Transaction *txn = get_txn(txn_manager, entry.txn_id); + st = ham_txn_commit((ham_txn_t *)txn, HAM_DONT_LOCK); + break; + } + case kEntryTypeInsert: { + PJournalEntryInsert *ins = (PJournalEntryInsert *)buffer.get_ptr(); + Transaction *txn = 0; + Database *db; + ham_key_t key = {0}; + ham_record_t record = {0}; + if (!ins) { + st = HAM_IO_ERROR; + goto bail; + } + + // do not insert if the key was already flushed to disk + if (entry.lsn <= start_lsn) + continue; + + key.data = ins->get_key_data(); + key.size = ins->key_size; + record.data = ins->get_record_data(); + record.size = ins->record_size; + record.partial_size = ins->record_partial_size; + record.partial_offset = ins->record_partial_offset; + if (entry.txn_id) + txn = get_txn(txn_manager, entry.txn_id); + db = get_db(entry.dbname); + st = ham_db_insert((ham_db_t *)db, (ham_txn_t *)txn, + &key, &record, ins->insert_flags | HAM_DONT_LOCK); + break; + } + case kEntryTypeErase: { + PJournalEntryErase *e = (PJournalEntryErase *)buffer.get_ptr(); + Transaction *txn = 0; + Database *db; + ham_key_t key = {0}; + if (!e) { + st = HAM_IO_ERROR; + goto bail; + } + + // do not erase if the key was already erased from disk + if (entry.lsn <= start_lsn) + continue; + + if (entry.txn_id) + txn = get_txn(txn_manager, entry.txn_id); + db = get_db(entry.dbname); + key.data = e->get_key_data(); + key.size = e->key_size; + st = ham_db_erase((ham_db_t *)db, 
(ham_txn_t *)txn, &key, + e->erase_flags | HAM_DONT_LOCK); + // key might have already been erased when the changeset + // was flushed + if (st == HAM_KEY_NOT_FOUND) + st = 0; + break; + } + case kEntryTypeChangeset: { + // skip this; the changeset was already applied + break; + } + default: + ham_log(("invalid journal entry type or journal is corrupt")); + st = HAM_IO_ERROR; + } + + if (st) + goto bail; + } while (1); + +bail: + // all transactions which are not yet committed will be aborted + abort_uncommitted_txns(txn_manager); + + // also close and delete all open databases - they were created in get_db() + close_all_databases(); + + // flush all committed transactions + if (st == 0) + st = m_state.env->flush(HAM_FLUSH_COMMITTED_TRANSACTIONS); + + // re-enable the logging + m_state.disable_logging = false; + + if (st) + throw Exception(st); + + // clear the journal files + clear(); +} + +void +Journal::clear_file(int idx) +{ + if (m_state.files[idx].is_open()) { + m_state.files[idx].truncate(0); + + // after truncate, the file pointer is far beyond the new end of file; + // reset the file pointer, or the next write will resize the file to + // the original size + m_state.files[idx].seek(0, File::kSeekSet); + } + + // clear the transaction counters + m_state.open_txn[idx] = 0; + m_state.closed_txn[idx] = 0; + + // also clear the buffer with the outstanding data + m_state.buffer[idx].clear(); +} + +std::string +Journal::get_path(int i) +{ + std::string path; + + if (m_state.env->config().log_filename.empty()) { + path = m_state.env->config().filename; + } + else { + path = m_state.env->config().log_filename; +#ifdef HAM_OS_WIN32 + path += "\\"; + char fname[_MAX_FNAME]; + char ext[_MAX_EXT]; + _splitpath(m_state.env->config().filename.c_str(), 0, 0, fname, ext); + path += fname; + path += ext; +#else + path += "/"; + path += ::basename((char *)m_state.env->config().filename.c_str()); +#endif + } + if (i == 0) + path += ".jrn0"; + else if (i == 1) + path += 
".jrn1"; + else + ham_assert(!"invalid index"); + return (path); +} + +JournalTest +Journal::test() +{ + return (JournalTest(&m_state)); +} + +JournalState::JournalState(LocalEnvironment *env) + : env(env), current_fd(0), threshold(env->config().journal_switch_threshold), + disable_logging(false), count_bytes_flushed(0), + count_bytes_before_compression(0), count_bytes_after_compression(0) +{ + if (threshold == 0) + threshold = kSwitchTxnThreshold; + + open_txn[0] = 0; + open_txn[1] = 0; + closed_txn[0] = 0; + closed_txn[1] = 0; +} + +} // namespace hamsterdb diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3journal/journal.h b/plugins/Dbx_kv/src/hamsterdb/src/3journal/journal.h new file mode 100644 index 0000000000..dd55b66fea --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3journal/journal.h @@ -0,0 +1,329 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Routines for the journal - writing, reading, recovering + * + * The journal is a facility for storing logical and physical redo-information. + * + * The logical information describes the database operation (i.e. insert/erase), + * the physical information describes the modified pages. + * + * "Undo" information is not required because aborted Transactions are never + * written to disk. The journal only can "redo" operations. + * + * The journal is organized in two files. 
If one of the files grows too large + * then all new Transactions are stored in the other file + * ("Log file switching"). When all Transactions from file #0 are committed, + * and file #1 exceeds a limit, then the files are switched back again. + * + * For writing, files are buffered. The buffers are flushed when they + * exceed a certain threshold, when a Transaction is committed or a Changeset + * was written. In case of a commit or a changeset there will also be an + * fsync, if HAM_ENABLE_FSYNC is enabled. + * + * The physical information is a collection of pages which are modified in + * one or more database operations (i.e. ham_db_erase). This collection is + * called a "changeset" and implemented in changeset.h/.cc. As soon as the + * operation is finished, the changeset is flushed: if the changeset contains + * just a single page, then this operation is atomic and is NOT logged. + * Otherwise the whole changeset is appended to the journal, and afterwards + * the database file is modified. + * + * For recovery to work, each page stores the lsn of its last modification. + * + * When recovering, the Journal first extracts the newest/latest entry. + * If this entry is a changeset then the changeset is reapplied, because + * we assume that there was a crash immediately AFTER the changeset was + * written, but BEFORE the database file was modified. (The changeset is + * idempotent; if the database file was successfully modified then the + * changes are re-applied; this is not a problem.) + * + * Afterwards, hamsterdb uses the lsn's to figure out whether an update + * was already applied or not. If the journal's last entry is a changeset then + * this changeset's lsn marks the beginning of the sequence. Otherwise the lsn + * is fetched from the journal file headers. All journal entries with an lsn + * *older* than this start-lsn will be skipped, all others are re-applied. 
+ * + * In this phase all changesets are skipped because the newest changeset was + * already applied, and we know that all older changesets + * have already been written successfully to the database file. + * + * @exception_safe: basic + * @thread_safe: no + */ + +#ifndef HAM_JOURNAL_H +#define HAM_JOURNAL_H + +#include "0root/root.h" + +#include <map> +#include <cstdio> +#include <string> + +#include "ham/hamsterdb_int.h" // for metrics + +#include "1base/dynamic_array.h" +#include "1os/file.h" +#include "1errorinducer/errorinducer.h" +#include "2page/page_collection.h" +#include "3journal/journal_entries.h" +#include "3journal/journal_state.h" +#include "3journal/journal_test.h" + +// Always verify that a file of level N does not include headers > N! + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +struct Context; +class Page; +class Database; +class Transaction; +class LocalEnvironment; +class LocalTransaction; +class LocalTransactionManager; + +#include "1base/packstart.h" + +// +// The Journal object +// +class Journal +{ + public: + enum { + // marks the start of a new transaction + kEntryTypeTxnBegin = 1, + + // marks the end of an aborted transaction + kEntryTypeTxnAbort = 2, + + // marks the end of an committed transaction + kEntryTypeTxnCommit = 3, + + // marks an insert operation + kEntryTypeInsert = 4, + + // marks an erase operation + kEntryTypeErase = 5, + + // marks a whole changeset operation (writes modified pages) + kEntryTypeChangeset = 6 + }; + + // + // An "iterator" structure for traversing the journal files + // + struct Iterator { + Iterator() + : fdidx(0), fdstart(0), offset(0) { + } + + // selects the file descriptor [0..1] + int fdidx; + + // which file descriptor did we start with? 
[0..1] + int fdstart; + + // the offset in the file of the NEXT entry + uint64_t offset; + }; + + // Constructor + Journal(LocalEnvironment *env); + + // Creates a new journal + void create(); + + // Opens an existing journal + void open(); + + // Returns true if the journal is empty + bool is_empty() { + if (!m_state.files[0].is_open() && !m_state.files[1].is_open()) + return (true); + + for (int i = 0; i < 2; i++) { + uint64_t size = m_state.files[i].get_file_size(); + if (size > 0) + return (false); + } + + return (true); + } + + // Appends a journal entry for ham_txn_begin/kEntryTypeTxnBegin + void append_txn_begin(LocalTransaction *txn, const char *name, + uint64_t lsn); + + // Appends a journal entry for ham_txn_abort/kEntryTypeTxnAbort + void append_txn_abort(LocalTransaction *txn, uint64_t lsn); + + // Appends a journal entry for ham_txn_commit/kEntryTypeTxnCommit + void append_txn_commit(LocalTransaction *txn, uint64_t lsn); + + // Appends a journal entry for ham_insert/kEntryTypeInsert + void append_insert(Database *db, LocalTransaction *txn, + ham_key_t *key, ham_record_t *record, uint32_t flags, + uint64_t lsn); + + // Appends a journal entry for ham_erase/kEntryTypeErase + void append_erase(Database *db, LocalTransaction *txn, + ham_key_t *key, int duplicate_index, uint32_t flags, + uint64_t lsn); + + // Appends a journal entry for a whole changeset/kEntryTypeChangeset + void append_changeset(const Page **pages, int num_pages, uint64_t lsn); + + // Adjusts the transaction counters; called whenever |txn| is flushed. + void transaction_flushed(LocalTransaction *txn); + + // Empties the journal, removes all entries + void clear() { + for (int i = 0; i < 2; i++) + clear_file(i); + } + + // Closes the journal, frees all allocated resources + void close(bool noclear = false); + + // Performs the recovery! 
All committed Transactions will be re-applied, + // all others are automatically aborted + void recover(LocalTransactionManager *txn_manager); + + // Fills the metrics + void fill_metrics(ham_env_metrics_t *metrics) { + metrics->journal_bytes_flushed = m_state.count_bytes_flushed; + } + + private: + friend struct JournalFixture; + + // Returns a pointer to database. If the database was not yet opened then + // it is opened implicitly. + Database *get_db(uint16_t dbname); + + // Returns a pointer to a Transaction object. + Transaction *get_txn(LocalTransactionManager *txn_manager, uint64_t txn_id); + + // Closes all databases. + void close_all_databases(); + + // Aborts all transactions which are still active. + void abort_uncommitted_txns(LocalTransactionManager *txn_manager); + + // Helper function which adds a single page from the changeset to + // the Journal; returns the page size (or compressed size, if compression + // was enabled) + uint32_t append_changeset_page(const Page *page, uint32_t page_size); + + // Recovers (re-applies) the physical changelog; returns the lsn of the + // Changelog + uint64_t recover_changeset(); + + // Scans a file for the newest changeset. Returns the lsn of this + // changeset, and the position (offset) in the file + uint64_t scan_for_newest_changeset(File *file, uint64_t *position); + + // Recovers the logical journal + void recover_journal(Context *context, + LocalTransactionManager *txn_manager, uint64_t start_lsn); + + // Switches the log file if necessary; returns the new log descriptor in the + // transaction + int switch_files_maybe(); + + // returns the path of the journal file + std::string get_path(int i); + + // Sequentially returns the next journal entry, starting with + // the oldest entry. + // + // |iter| must be initialized with zeroes for the first call. + // |auxbuffer| returns the auxiliary data of the entry and is either + // a structure of type PJournalEntryInsert or PJournalEntryErase. 
+ // + // Returns an empty entry (lsn is zero) after the last element. + void get_entry(Iterator *iter, PJournalEntry *entry, + ByteArray *auxbuffer); + + // Appends an entry to the journal + void append_entry(int idx, + const uint8_t *ptr1 = 0, size_t ptr1_size = 0, + const uint8_t *ptr2 = 0, size_t ptr2_size = 0, + const uint8_t *ptr3 = 0, size_t ptr3_size = 0, + const uint8_t *ptr4 = 0, size_t ptr4_size = 0, + const uint8_t *ptr5 = 0, size_t ptr5_size = 0) { + if (ptr1_size) + m_state.buffer[idx].append(ptr1, ptr1_size); + if (ptr2_size) + m_state.buffer[idx].append(ptr2, ptr2_size); + if (ptr3_size) + m_state.buffer[idx].append(ptr3, ptr3_size); + if (ptr4_size) + m_state.buffer[idx].append(ptr4, ptr4_size); + if (ptr5_size) + m_state.buffer[idx].append(ptr5, ptr5_size); + } + + // flush buffer if size limit is exceeded + void maybe_flush_buffer(int idx) { + if (m_state.buffer[idx].get_size() >= JournalState::kBufferLimit) + flush_buffer(idx); + } + + // Flushes a buffer to disk + void flush_buffer(int idx, bool fsync = false) { + if (m_state.buffer[idx].get_size() > 0) { + // error inducer? 
then write only a part of the buffer and return + if (ErrorInducer::is_active() + && ErrorInducer::get_instance()->induce(ErrorInducer::kChangesetFlush)) { + m_state.files[idx].write(m_state.buffer[idx].get_ptr(), + m_state.buffer[idx].get_size() - 5); + throw Exception(HAM_INTERNAL_ERROR); + } + + m_state.files[idx].write(m_state.buffer[idx].get_ptr(), + m_state.buffer[idx].get_size()); + m_state.count_bytes_flushed += m_state.buffer[idx].get_size(); + + m_state.buffer[idx].clear(); + if (fsync) + m_state.files[idx].flush(); + } + } + + // Clears a single file + void clear_file(int idx); + + // Returns the test object + JournalTest test(); + + private: + // The mutable state + JournalState m_state; +}; + +#include "1base/packstop.h" + +} // namespace hamsterdb + +#endif /* HAM_JOURNAL_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3journal/journal_entries.h b/plugins/Dbx_kv/src/hamsterdb/src/3journal/journal_entries.h new file mode 100644 index 0000000000..b32f53693b --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3journal/journal_entries.h @@ -0,0 +1,208 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * journal entries for insert, erase, begin, commit, abort... 
+ * + * @exception_safe: nothrow + * @thread_safe: yes + */ + +#ifndef HAM_JOURNAL_ENTRIES_H +#define HAM_JOURNAL_ENTRIES_H + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +#include "1base/packstart.h" + +/* + * A journal entry for all txn related operations (begin, commit, abort) + * + * This structure can be followed by one of the structures below + * (PJournalEntryInsert or PJournalEntryERASE); the field |followup_size| + * is the structure size of this follow-up structure. + */ +HAM_PACK_0 struct HAM_PACK_1 PJournalEntry { + // Constructor - sets all fields to 0 + PJournalEntry() + : lsn(0), followup_size(0), txn_id(0), type(0), + dbname(0), _reserved(0) { + } + + // the lsn of this entry + uint64_t lsn; + + // the size of the follow-up entry in bytes (may be padded) + uint64_t followup_size; + + // the transaction id + uint64_t txn_id; + + // the type of this entry + uint32_t type; + + // the name of the database which is modified by this entry + uint16_t dbname; + + // a reserved value - reqd for padding + uint16_t _reserved; +} HAM_PACK_2; + +#include "1base/packstop.h" + + +#include "1base/packstart.h" + +// +// a Journal entry for an 'insert' operation +// +HAM_PACK_0 struct HAM_PACK_1 PJournalEntryInsert { + // Constructor - sets all fields to 0 + PJournalEntryInsert() + : key_size(0), compressed_key_size(0), record_size(0), + compressed_record_size(0), record_partial_size(0), + record_partial_offset(0), insert_flags(0) { + data[0] = 0; + } + + // key size + uint16_t key_size; + + // PRO: compressed key size + uint16_t compressed_key_size; + + // record size + uint32_t record_size; + + // PRO: compressed record size + uint32_t compressed_record_size; + + // record partial size + uint32_t record_partial_size; + + // record partial offset + uint32_t record_partial_offset; + + // flags of ham_insert(), 
ham_cursor_insert() + uint32_t insert_flags; + + // data follows here - first |key_size| bytes for the key, then + // |record_size| bytes for the record (and maybe some padding) + // + // PRO: this data can be compressed + uint8_t data[1]; + + // Returns a pointer to the key data + uint8_t *get_key_data() { + return (&data[0]); + } + + // Returns a pointer to the record data + uint8_t *get_record_data() { + return (&data[key_size]); + } +} HAM_PACK_2; + +#include "1base/packstop.h" + + +#include "1base/packstart.h" + +// +// a Journal entry for 'erase' operations +// +HAM_PACK_0 struct HAM_PACK_1 PJournalEntryErase { + // Constructor - sets all fields to 0 + PJournalEntryErase() + : key_size(0), compressed_key_size(0), erase_flags(0), duplicate(0) { + data[0] = 0; + } + + // key size + uint16_t key_size; + + // PRO: compressed key size + uint16_t compressed_key_size; + + // flags of ham_erase(), ham_cursor_erase() + uint32_t erase_flags; + + // which duplicate to erase + int duplicate; + + // the key data + // + // PRO: this data can be compressed + uint8_t data[1]; + + // Returns a pointer to the key data + uint8_t *get_key_data() { + return (&data[0]); + } +} HAM_PACK_2; + +#include "1base/packstop.h" + + +#include "1base/packstart.h" + +// +// a Journal entry for a 'changeset' group +// +HAM_PACK_0 struct HAM_PACK_1 PJournalEntryChangeset { + // Constructor - sets all fields to 0 + PJournalEntryChangeset() + : num_pages(0) { + } + + // number of pages in this changeset + uint32_t num_pages; +} HAM_PACK_2; + +#include "1base/packstop.h" + + +#include "1base/packstart.h" + +// +// a Journal entry for a single page +// +HAM_PACK_0 struct HAM_PACK_1 PJournalEntryPageHeader { + // Constructor - sets all fields to 0 + PJournalEntryPageHeader(uint64_t _address = 0) + : address(_address), compressed_size(0) { + } + + // the page address + uint64_t address; + + // PRO: the compressed size, if compression is enabled + uint32_t compressed_size; +} HAM_PACK_2; + +#include 
"1base/packstop.h" + +} // namespace hamsterdb + +#endif /* HAM_JOURNAL_ENTRIES_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3journal/journal_state.h b/plugins/Dbx_kv/src/hamsterdb/src/3journal/journal_state.h new file mode 100644 index 0000000000..817fcac1d5 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3journal/journal_state.h @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * The Journal's state + * + * @exception_safe: nothrow + * @thread_safe: no + */ + +#ifndef HAM_JOURNAL_STATE_H +#define HAM_JOURNAL_STATE_H + +#include "0root/root.h" + +#include <map> +#include <string> + +#include "ham/hamsterdb_int.h" // for metrics + +#include "1base/dynamic_array.h" +#include "1os/file.h" + +// Always verify that a file of level N does not include headers > N! 
namespace hamsterdb {

class Database;
class LocalEnvironment;

//
// The mutable state of the Journal. All members are public; the Journal
// accesses them directly (i.e. |buffer|, |files| and |count_bytes_flushed|
// when flushing a buffer).
//
// The journal works with two files; every array with two elements has
// one slot per file.
//
struct JournalState
{
  enum {
    // switch log file after |kSwitchTxnThreshold| transactions
    kSwitchTxnThreshold = 32,

    // flush buffers as soon as they reach (or exceed) this size
    kBufferLimit = 1024 * 1024 // 1 mb
  };

  // Constructor; defined elsewhere
  JournalState(LocalEnvironment *env);

  // References the Environment this journal file is for
  LocalEnvironment *env;

  // The index of the file descriptor we are currently writing to (0 or 1)
  uint32_t current_fd;

  // The two file descriptors
  File files[2];

  // Buffers for writing data to the files; one buffer per file
  ByteArray buffer[2];

  // For counting all open transactions in the files
  size_t open_txn[2];

  // For counting all closed transactions in the files
  size_t closed_txn[2];

  // The lsn of the previous checkpoint
  uint64_t last_cp_lsn;

  // When having more than these Transactions in one file, we
  // swap the files
  // NOTE(review): presumably initialized with |kSwitchTxnThreshold| -
  // confirm in the constructor
  size_t threshold;

  // Set to false to disable logging; used during recovery
  // NOTE(review): name and description seem to contradict each other
  // (a flag called |disable_logging| should disable logging if it is
  // *true*) - verify against the code which reads this flag
  bool disable_logging;

  // Counting the flushed bytes (for ham_env_get_metrics)
  uint64_t count_bytes_flushed;

  // Counting the bytes before compression (for ham_env_get_metrics)
  uint64_t count_bytes_before_compression;

  // Counting the bytes after compression (for ham_env_get_metrics)
  uint64_t count_bytes_after_compression;

  // A map of all opened Databases; the key is a 16bit value
  // (presumably the database name - TODO confirm)
  typedef std::map<uint16_t, Database *> DatabaseMap;
  DatabaseMap database_map;
};

} // namespace hamsterdb
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Test gateway for the Journal + * + * @exception_safe: nothrow + * @thread_safe: no + */ + +#ifndef HAM_JOURNAL_TEST_H +#define HAM_JOURNAL_TEST_H + +#include "0root/root.h" + +#include "ham/hamsterdb_int.h" // for metrics + +#include "3journal/journal_state.h" + +// Always verify that a file of level N does not include headers > N! + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class JournalTest +{ + public: + JournalTest(JournalState *state) + : m_state(state) { + } + + // Returns the state + JournalState *state() { return (m_state); } + + private: + // The journal's state + JournalState *m_state; +}; + +} // namespace hamsterdb + +#endif /* HAM_JOURNAL_TEST_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3page_manager/page_manager.cc b/plugins/Dbx_kv/src/hamsterdb/src/3page_manager/page_manager.cc new file mode 100644 index 0000000000..bec3cc32e0 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3page_manager/page_manager.cc @@ -0,0 +1,798 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +#include <string.h> + +// Always verify that a file of level N does not include headers > N! +#include "1base/dynamic_array.h" +#include "1base/pickle.h" +#include "2page/page.h" +#include "2device/device.h" +#include "2queue/queue.h" +#include "3page_manager/page_manager.h" +#include "3page_manager/page_manager_worker.h" +#include "3page_manager/page_manager_test.h" +#include "3btree/btree_index.h" +#include "3btree/btree_node_proxy.h" +#include "4context/context.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +enum { + kPurgeAtLeast = 20 +}; + +PageManagerState::PageManagerState(LocalEnvironment *env) + : config(env->config()), header(env->header()), + device(env->device()), lsn_manager(env->lsn_manager()), + cache(env->config()), needs_flush(false), purge_cache_pending(false), + state_page(0), last_blob_page(0), last_blob_page_id(0), + page_count_fetched(0), page_count_index(0), page_count_blob(0), + page_count_page_manager(0), cache_hits(0), cache_misses(0), + freelist_hits(0), freelist_misses(0) +{ +} + +PageManager::PageManager(LocalEnvironment *env) + : m_state(env) +{ + /* start the worker thread */ + m_worker.reset(new PageManagerWorker(&m_state.cache)); +} + +void +PageManager::initialize(uint64_t pageid) +{ + Context context(0, 0, 0); + + m_state.free_pages.clear(); + if (m_state.state_page) + delete m_state.state_page; + m_state.state_page = new Page(m_state.device); + m_state.state_page->fetch(pageid); + + Page *page = m_state.state_page; + uint32_t 
page_size = m_state.config.page_size_bytes; + + // the first page stores the page ID of the last blob + m_state.last_blob_page_id = *(uint64_t *)page->get_payload(); + + while (1) { + ham_assert(page->get_type() == Page::kTypePageManager); + uint8_t *p = page->get_payload(); + // skip m_state.last_blob_page_id? + if (page == m_state.state_page) + p += sizeof(uint64_t); + + // get the overflow address + uint64_t overflow = *(uint64_t *)p; + p += 8; + + // get the number of stored elements + uint32_t counter = *(uint32_t *)p; + p += 4; + + // now read all pages + for (uint32_t i = 0; i < counter; i++) { + // 4 bits page_counter, 4 bits for number of following bytes + int page_counter = (*p & 0xf0) >> 4; + int num_bytes = *p & 0x0f; + ham_assert(page_counter > 0); + ham_assert(num_bytes <= 8); + p += 1; + + uint64_t id = Pickle::decode_u64(num_bytes, p); + p += num_bytes; + + m_state.free_pages[id * page_size] = page_counter; + } + + // load the overflow page + if (overflow) + page = fetch(&context, overflow, 0); + else + break; + } +} + +Page * +PageManager::fetch(Context *context, uint64_t address, uint32_t flags) +{ + /* fetch the page from the cache */ + Page *page; + + if (address == 0) + page = m_state.header->get_header_page(); + else + page = m_state.cache.get(address); + + if (page) { + if (flags & PageManager::kNoHeader) + page->set_without_header(true); + return (safely_lock_page(context, page, true)); + } + + if ((flags & PageManager::kOnlyFromCache) + || m_state.config.flags & HAM_IN_MEMORY) + return (0); + + page = new Page(m_state.device, context->db); + try { + page->fetch(address); + } + catch (Exception &ex) { + delete page; + throw ex; + } + + ham_assert(page->get_data()); + + /* store the page in the list */ + m_state.cache.put(page); + + /* write to disk (if necessary) */ + if (!(flags & PageManager::kDisableStoreState) + && !(flags & PageManager::kReadOnly)) + maybe_store_state(context, false); + + if (flags & PageManager::kNoHeader) + 
page->set_without_header(true); + + m_state.page_count_fetched++; + return (safely_lock_page(context, page, false)); +} + +Page * +PageManager::alloc(Context *context, uint32_t page_type, uint32_t flags) +{ + uint64_t address = 0; + Page *page = 0; + uint32_t page_size = m_state.config.page_size_bytes; + bool allocated = false; + + /* first check the internal list for a free page */ + if ((flags & PageManager::kIgnoreFreelist) == 0 + && !m_state.free_pages.empty()) { + PageManagerState::FreeMap::iterator it = m_state.free_pages.begin(); + + address = it->first; + ham_assert(address % page_size == 0); + /* remove the page from the freelist */ + m_state.free_pages.erase(it); + m_state.needs_flush = true; + + m_state.freelist_hits++; + + /* try to fetch the page from the cache */ + page = m_state.cache.get(address); + if (page) + goto done; + /* allocate a new page structure and read the page from disk */ + page = new Page(m_state.device, context->db); + page->fetch(address); + goto done; + } + + m_state.freelist_misses++; + + try { + if (!page) { + allocated = true; + page = new Page(m_state.device, context->db); + } + + page->alloc(page_type); + } + catch (Exception &ex) { + if (allocated) + delete page; + throw ex; + } + +done: + /* clear the page with zeroes? 
*/ + if (flags & PageManager::kClearWithZero) + memset(page->get_data(), 0, page_size); + + /* initialize the page; also set the 'dirty' flag to force logging */ + page->set_type(page_type); + page->set_dirty(true); + page->set_db(context->db); + + if (page->get_node_proxy()) { + delete page->get_node_proxy(); + page->set_node_proxy(0); + } + + /* store the page in the cache and the Changeset */ + m_state.cache.put(page); + safely_lock_page(context, page, false); + + /* write to disk (if necessary) */ + if (!(flags & PageManager::kDisableStoreState) + && !(flags & PageManager::kReadOnly)) + maybe_store_state(context, false); + + switch (page_type) { + case Page::kTypeBindex: + case Page::kTypeBroot: { + memset(page->get_payload(), 0, sizeof(PBtreeNode)); + m_state.page_count_index++; + break; + } + case Page::kTypePageManager: + m_state.page_count_page_manager++; + break; + case Page::kTypeBlob: + m_state.page_count_blob++; + break; + default: + break; + } + + return (page); +} + +Page * +PageManager::alloc_multiple_blob_pages(Context *context, size_t num_pages) +{ + // allocate only one page? 
then use the normal ::alloc() method + if (num_pages == 1) + return (alloc(context, Page::kTypeBlob, 0)); + + Page *page = 0; + uint32_t page_size = m_state.config.page_size_bytes; + + // Now check the freelist + if (!m_state.free_pages.empty()) { + for (PageManagerState::FreeMap::iterator it = m_state.free_pages.begin(); + it != m_state.free_pages.end(); + it++) { + if (it->second >= num_pages) { + for (size_t i = 0; i < num_pages; i++) { + if (i == 0) { + page = fetch(context, it->first, 0); + page->set_type(Page::kTypeBlob); + page->set_without_header(false); + } + else { + Page *p = fetch(context, it->first + (i * page_size), 0); + p->set_type(Page::kTypeBlob); + p->set_without_header(true); + } + } + if (it->second > num_pages) { + m_state.free_pages[it->first + num_pages * page_size] + = it->second - num_pages; + } + m_state.free_pages.erase(it); + return (page); + } + } + } + + // Freelist lookup was not successful -> allocate new pages. Only the first + // page is a regular page; all others do not have page headers. + // + // disable "store state": the PageManager otherwise could alloc overflow + // pages in the middle of our blob sequence. 
+ uint32_t flags = PageManager::kIgnoreFreelist + | PageManager::kDisableStoreState; + for (size_t i = 0; i < num_pages; i++) { + if (page == 0) + page = alloc(context, Page::kTypeBlob, flags); + else { + Page *p = alloc(context, Page::kTypeBlob, flags); + p->set_without_header(true); + } + } + + // now store the state + maybe_store_state(context, false); + return (page); +} + +void +PageManager::fill_metrics(ham_env_metrics_t *metrics) const +{ + metrics->page_count_fetched = m_state.page_count_fetched; + metrics->page_count_flushed = Page::ms_page_count_flushed; + metrics->page_count_type_index = m_state.page_count_index; + metrics->page_count_type_blob = m_state.page_count_blob; + metrics->page_count_type_page_manager = m_state.page_count_page_manager; + metrics->freelist_hits = m_state.freelist_hits; + metrics->freelist_misses = m_state.freelist_misses; + m_state.cache.fill_metrics(metrics); +} + +struct FlushAllPagesPurger +{ + FlushAllPagesPurger(bool delete_pages) + : delete_pages(delete_pages) { + } + + bool operator()(Page *page) { + ScopedSpinlock lock(page->mutex()); + page->flush(); + return (delete_pages); + } + + bool delete_pages; +}; + +void +PageManager::flush(bool delete_pages) +{ + FlushAllPagesPurger purger(delete_pages); + m_state.cache.purge_if(purger); + + if (m_state.state_page) { + ScopedSpinlock lock(m_state.state_page->mutex()); + m_state.state_page->flush(); + } +} + +// Returns true if the page can be purged: page must use allocated +// memory instead of an mmapped pointer; page must not be in use (= in +// a changeset) and not have cursors attached +struct PurgeProcessor +{ + PurgeProcessor(Page *last_blob_page, FlushPageMessage *message) + : last_blob_page(last_blob_page), message(message) { + } + + bool operator()(Page *page) { + // the lock in here will be unlocked by the worker thread + if (page == last_blob_page || !page->mutex().try_lock()) + return (false); + message->list.push_back(page); + return (true); + } + + Page 
*last_blob_page; + FlushPageMessage *message; +}; + +void +PageManager::purge_cache(Context *context) +{ + // do NOT purge the cache iff + // 1. this is an in-memory Environment + // 2. there's still a "purge cache" operation pending + // 3. the cache is not full + if (m_state.config.flags & HAM_IN_MEMORY + || m_state.purge_cache_pending + || !m_state.cache.is_cache_full()) + return; + + // Purge as many pages as possible to get memory usage down to the + // cache's limit. + FlushPageMessage *message = new FlushPageMessage(); + PurgeProcessor processor(m_state.last_blob_page, message); + m_state.cache.purge(processor, m_state.last_blob_page); + + if (message->list.size()) + m_worker->add_to_queue(message); + else + delete message; +} + +void +PageManager::reclaim_space(Context *context) +{ + if (m_state.last_blob_page) { + m_state.last_blob_page_id = m_state.last_blob_page->get_address(); + m_state.last_blob_page = 0; + } + ham_assert(!(m_state.config.flags & HAM_DISABLE_RECLAIM_INTERNAL)); + + bool do_truncate = false; + size_t file_size = m_state.device->file_size(); + uint32_t page_size = m_state.config.page_size_bytes; + + while (m_state.free_pages.size() > 1) { + PageManagerState::FreeMap::iterator fit = + m_state.free_pages.find(file_size - page_size); + if (fit != m_state.free_pages.end()) { + Page *page = m_state.cache.get(fit->first); + if (page) { + m_state.cache.del(page); + delete page; + } + file_size -= page_size; + do_truncate = true; + m_state.free_pages.erase(fit); + continue; + } + break; + } + + if (do_truncate) { + m_state.needs_flush = true; + maybe_store_state(context, true); + m_state.device->truncate(file_size); + } +} + +struct DbClosePurger +{ + DbClosePurger(LocalDatabase *db) + : m_db(db) { + } + + bool operator()(Page *page) { + if (page->get_db() == m_db && page->get_address() != 0) { + ScopedSpinlock lock(page->mutex()); + ham_assert(page->cursor_list() == 0); + page->flush(); + return (true); + } + return (false); + } + + 
LocalDatabase *m_db; +}; + +void +PageManager::close_database(Context *context, LocalDatabase *db) +{ + if (m_state.last_blob_page) { + m_state.last_blob_page_id = m_state.last_blob_page->get_address(); + m_state.last_blob_page = 0; + } + + context->changeset.clear(); + + DbClosePurger purger(db); + m_state.cache.purge_if(purger); +} + +void +PageManager::del(Context *context, Page *page, size_t page_count) +{ + ham_assert(page_count > 0); + + if (m_state.config.flags & HAM_IN_MEMORY) + return; + + // remove all pages from the changeset, otherwise they won't be unlocked + context->changeset.del(page); + if (page_count > 1) { + uint32_t page_size = m_state.config.page_size_bytes; + for (size_t i = 1; i < page_count; i++) { + Page *p = m_state.cache.get(page->get_address() + i * page_size); + if (p && context->changeset.has(p)) + context->changeset.del(p); + } + } + + m_state.needs_flush = true; + m_state.free_pages[page->get_address()] = page_count; + ham_assert(page->get_address() % m_state.config.page_size_bytes == 0); + + if (page->get_node_proxy()) { + delete page->get_node_proxy(); + page->set_node_proxy(0); + } + + // do not call maybe_store_state() - this change in the m_state is not + // relevant for logging. +} + +void +PageManager::reset(Context *context) +{ + close(context); + + /* start the worker thread */ + m_worker.reset(new PageManagerWorker(&m_state.cache)); +} + +void +PageManager::close(Context *context) +{ + /* wait for the worker thread to stop */ + if (m_worker.get()) + m_worker->stop_and_join(); + + // store the state of the PageManager + if ((m_state.config.flags & HAM_IN_MEMORY) == 0 + && (m_state.config.flags & HAM_READ_ONLY) == 0) { + maybe_store_state(context, true); + } + + // reclaim unused disk space + // if logging is enabled: also flush the changeset to write back the + // modified freelist pages + bool try_reclaim = m_state.config.flags & HAM_DISABLE_RECLAIM_INTERNAL + ? 
false + : true; + +#ifdef WIN32 + // Win32: it's not possible to truncate the file while there's an active + // mapping, therefore only reclaim if memory mapped I/O is disabled + if (!(m_state.config.flags & HAM_DISABLE_MMAP)) + try_reclaim = false; +#endif + + if (try_reclaim) { + reclaim_space(context); + } + + // clear the Changeset because flush() will delete all Page pointers + context->changeset.clear(); + + // flush all dirty pages to disk, then delete them + flush(true); + + delete m_state.state_page; + m_state.state_page = 0; + m_state.last_blob_page = 0; +} + +Page * +PageManager::get_last_blob_page(Context *context) +{ + if (m_state.last_blob_page) + return (safely_lock_page(context, m_state.last_blob_page, true)); + if (m_state.last_blob_page_id) + return (fetch(context, m_state.last_blob_page_id, 0)); + return (0); +} + +void +PageManager::set_last_blob_page(Page *page) +{ + m_state.last_blob_page_id = 0; + m_state.last_blob_page = page; +} + +uint64_t +PageManager::store_state(Context *context) +{ + // no modifications? then simply return the old blobid + if (!m_state.needs_flush) + return (m_state.state_page ? m_state.state_page->get_address() : 0); + + m_state.needs_flush = false; + + // no freelist pages, no freelist state? 
then don't store anything + if (!m_state.state_page && m_state.free_pages.empty()) + return (0); + + // otherwise allocate a new page, if required + if (!m_state.state_page) { + m_state.state_page = new Page(m_state.device); + m_state.state_page->alloc(Page::kTypePageManager, + Page::kInitializeWithZeroes); + } + + // don't bother locking the state page + context->changeset.put(m_state.state_page); + + uint32_t page_size = m_state.config.page_size_bytes; + + // make sure that the page is logged + Page *page = m_state.state_page; + page->set_dirty(true); + + uint8_t *p = page->get_payload(); + + // store page-ID of the last allocated blob + *(uint64_t *)p = m_state.last_blob_page_id; + p += sizeof(uint64_t); + + // reset the overflow pointer and the counter + // TODO here we lose a whole chain of overflow pointers if there was such + // a chain. We only save the first. That's not critical but also not nice. + uint64_t next_pageid = *(uint64_t *)p; + if (next_pageid) { + m_state.free_pages[next_pageid] = 1; + ham_assert(next_pageid % page_size == 0); + } + + // No freelist entries? then we're done. Make sure that there's no + // overflow pointer or other garbage in the page! + if (m_state.free_pages.empty()) { + *(uint64_t *)p = 0; + p += sizeof(uint64_t); + *(uint32_t *)p = 0; + return (m_state.state_page->get_address()); + } + + PageManagerState::FreeMap::const_iterator it = m_state.free_pages.begin(); + while (it != m_state.free_pages.end()) { + // this is where we will store the data + p = page->get_payload(); + // skip m_state.last_blob_page_id? 
+ if (page == m_state.state_page) + p += sizeof(uint64_t); + p += 8; // leave room for the pointer to the next page + p += 4; // leave room for the counter + + uint32_t counter = 0; + + while (it != m_state.free_pages.end()) { + // 9 bytes is the maximum amount of storage that we will need for a + // new entry; if it does not fit then break + if ((p + 9) - page->get_payload() + >= (ptrdiff_t)(m_state.config.page_size_bytes + - Page::kSizeofPersistentHeader)) + break; + + // ... and check if the next entry (and the following) are directly + // next to the current page + uint32_t page_counter = 1; + uint64_t base = it->first; + ham_assert(base % page_size == 0); + uint64_t current = it->first; + + // move to the next entry + it++; + + for (; it != m_state.free_pages.end() && page_counter < 16 - 1; it++) { + if (it->first != current + page_size) + break; + current += page_size; + page_counter++; + } + + // now |base| is the start of a sequence of free pages, and the + // sequence has |page_counter| pages + // + // This is encoded as + // - 1 byte header + // - 4 bits for |page_counter| + // - 4 bits for the number of bytes following ("n") + // - n byte page-id (div page_size) + ham_assert(page_counter < 16); + int num_bytes = Pickle::encode_u64(p + 1, base / page_size); + *p = (page_counter << 4) | num_bytes; + p += 1 + num_bytes; + + counter++; + } + + p = page->get_payload(); + if (page == m_state.state_page) // skip m_state.last_blob_page_id? + p += sizeof(uint64_t); + uint64_t next_pageid = *(uint64_t *)p; + *(uint64_t *)p = 0; + p += 8; // overflow page + + // now store the counter + *(uint32_t *)p = counter; + + // are we done? 
if not then continue with the next page + if (it != m_state.free_pages.end()) { + // allocate (or fetch) an overflow page + if (!next_pageid) { + Page *new_page = alloc(context, Page::kTypePageManager, + PageManager::kIgnoreFreelist); + // patch the overflow pointer in the old (current) page + p = page->get_payload(); + if (page == m_state.state_page) // skip m_state.last_blob_page_id? + p += sizeof(uint64_t); + *(uint64_t *)p = new_page->get_address(); + + // reset the overflow pointer in the new page + page = new_page; + p = page->get_payload(); + *(uint64_t *)p = 0; + } + else + page = fetch(context, next_pageid, 0); + + // make sure that the page is logged + page->set_dirty(true); + } + } + + return (m_state.state_page->get_address()); +} + +void +PageManager::maybe_store_state(Context *context, bool force) +{ + if (force || (m_state.config.flags & HAM_ENABLE_RECOVERY)) { + uint64_t new_blobid = store_state(context); + if (new_blobid != m_state.header->get_page_manager_blobid()) { + m_state.header->set_page_manager_blobid(new_blobid); + // don't bother to lock the header page + m_state.header->get_header_page()->set_dirty(true); + context->changeset.put(m_state.header->get_header_page()); + } + } +} + +Page * +PageManager::safely_lock_page(Context *context, Page *page, + bool allow_recursive_lock) +{ + context->changeset.put(page); + + ham_assert(page->mutex().try_lock() == false); + + // fetch contents again? 
+ if (!page->get_data()) { + page->fetch(page->get_address()); + } + + return (page); +} + +PageManagerTest +PageManager::test() +{ + return (PageManagerTest(this)); +} + +PageManagerTest::PageManagerTest(PageManager *page_manager) + : m_sut(page_manager) +{ +} + +uint64_t +PageManagerTest::store_state() +{ + Context context(0, 0, 0); + return (m_sut->store_state(&context)); +} + +void +PageManagerTest::remove_page(Page *page) +{ + m_sut->m_state.cache.del(page); +} + +bool +PageManagerTest::is_page_free(uint64_t pageid) +{ + return (m_sut->m_state.free_pages.find(pageid) + != m_sut->m_state.free_pages.end()); +} + +Page * +PageManagerTest::fetch_page(uint64_t id) +{ + return (m_sut->m_state.cache.get(id)); +} + +void +PageManagerTest::store_page(Page *page) +{ + m_sut->m_state.cache.put(page); +} + +bool +PageManagerTest::is_cache_full() +{ + return (m_sut->m_state.cache.is_cache_full()); +} + +PageManagerState * +PageManagerTest::state() +{ + return (&m_sut->m_state); +} + +} // namespace hamsterdb diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3page_manager/page_manager.h b/plugins/Dbx_kv/src/hamsterdb/src/3page_manager/page_manager.h new file mode 100644 index 0000000000..a6593e39ae --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3page_manager/page_manager.h @@ -0,0 +1,155 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
namespace hamsterdb {

struct Context;
class LocalDatabase;
class LocalEnvironment;

//
// The PageManager allocates, fetches and frees pages and maintains the
// list of free pages (the "freelist"), which is persisted in the file.
//
class PageManager
{
  public:
    // NOTE(review): the flags for alloc() and the flags for fetch() share
    // the same numeric values (1, 2, 4), but the implementations of
    // alloc() and fetch() both test kDisableStoreState and kReadOnly.
    // Therefore kNoHeader passed to fetch() is also seen as
    // kDisableStoreState, and kIgnoreFreelist passed to alloc() is also
    // seen as kReadOnly. Confirm that this is intended before changing
    // any of these values.
    enum {
      // flag for alloc(): Clear the full page with zeroes
      kClearWithZero     = 1,

      // flag for alloc(): Ignores the freelist
      kIgnoreFreelist    = 2,

      // flag for alloc(): Do not persist the PageManager state to disk
      kDisableStoreState = 4,

      // Flag for fetch(): only fetches from cache, not from disk
      kOnlyFromCache = 1,

      // Flag for fetch(): does not persist the PageManager state after the
      // page was read from disk.
      // NOTE(review): this comment used to say "does not add page to the
      // Changeset", but fetch() adds the page to the Changeset regardless
      // of this flag - verify the intended behaviour
      kReadOnly = 2,

      // Flag for fetch(): page is part of a multi-page blob, has no header
      kNoHeader = 4
    };

    // Constructor; starts the worker thread
    PageManager(LocalEnvironment *env);

    // Loads the persisted state (the id of the last blob page and the
    // freelist) from the page with the address |blobid|
    void initialize(uint64_t blobid);

    // Fills in the current metrics for the PageManager, the Cache and the
    // Freelist
    void fill_metrics(ham_env_metrics_t *metrics) const;

    // Fetches a page from the cache or, if it is not cached, from disk.
    // |flags| are bitwise OR'd: kOnlyFromCache, kReadOnly, kNoHeader...
    // Returns 0 if the page is not cached and was not read from disk
    // (kOnlyFromCache, or in-memory Environment).
    // The page is locked and stored in |context->changeset|.
    Page *fetch(Context *context, uint64_t address, uint32_t flags = 0);

    // Allocates a new page. |page_type| is one of Page::kType* in page.h.
    // |flags| are either 0 or a bitwise OR of kClearWithZero,
    // kIgnoreFreelist and kDisableStoreState.
    // The page is locked and stored in |context->changeset|.
    Page *alloc(Context *context, uint32_t page_type, uint32_t flags = 0);

    // Allocates multiple adjacent pages.
    // Used by the BlobManager to store blobs that span multiple pages
    // Returns the first page in the list of pages
    // The pages are locked and stored in |context->changeset|.
    Page *alloc_multiple_blob_pages(Context *context, size_t num_pages);

    // Flushes all pages to disk and deletes them if |delete_pages| is true
    void flush(bool delete_pages);

    // Asks the worker thread to purge the cache if the cache limits are
    // exceeded
    void purge_cache(Context *context);

    // Reclaim file space; truncates unused file space at the end of the file.
    void reclaim_space(Context *context);

    // Flushes and closes all pages of a database
    void close_database(Context *context, LocalDatabase *db);

    // Schedules one (or many sequential) pages for deletion and adds them
    // to the Freelist. Will not do anything if the Environment is in-memory.
    void del(Context *context, Page *page, size_t page_count = 1);

    // Resets the PageManager; calls close(), then starts a new worker thread
    void reset(Context *context);

    // Closes the PageManager; stops the worker thread, persists the state
    // and flushes all dirty pages
    void close(Context *context);

    // Returns the Page pointer where we can add more blobs; returns 0 if
    // there is no such page
    Page *get_last_blob_page(Context *context);

    // Sets the Page pointer where we can add more blobs
    void set_last_blob_page(Page *page);

    // Returns additional testing interfaces
    PageManagerTest test();

  private:
    // NOTE(review): there is no |Purger| in page_manager.cc; presumably
    // a leftover - verify before removing
    friend struct Purger;
    friend class PageManagerTest;
    friend class PageManagerWorker;

    // Persists the PageManager's state in the file; returns the address
    // of the state page (or 0 if there is no state to store)
    uint64_t store_state(Context *context);

    // Calls store_state() whenever it makes sense: if |force| is true
    // or if recovery is enabled
    void maybe_store_state(Context *context, bool force);

    // Locks a page, fetches contents from disk if they were flushed in
    // the meantime
    // NOTE(review): the implementation adds the page to the Changeset and
    // does not use |allow_recursive_lock|
    Page *safely_lock_page(Context *context, Page *page,
                bool allow_recursive_lock);

    // The worker thread which flushes dirty pages
    ScopedPtr<PageManagerWorker> m_worker;

    // The state
    PageManagerState m_state;
};

} // namespace hamsterdb
+#include "2config/env_config.h" +#include "3cache/cache.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class Device; +class EnvironmentHeader; +class LocalDatabase; +class LocalEnvironment; +class LsnManager; + +/* + * The internal state of the PageManager + */ +struct PageManagerState +{ + // The freelist maps page-id to number of free pages (usually 1) + typedef std::map<uint64_t, size_t> FreeMap; + + PageManagerState(LocalEnvironment *env); + + // Copy of the Environment's configuration + const EnvironmentConfiguration config; + + // The Environment's header + EnvironmentHeader *header; + + // The Device + Device *device; + + // The lsn manager + LsnManager *lsn_manager; + + // The cache + Cache cache; + + // The map with free pages + FreeMap free_pages; + + // Whether |m_free_pages| must be flushed or not + bool needs_flush; + + // Whether a "purge cache" operation is pending + boost::atomic<bool> purge_cache_pending; + + // Page with the persisted state data. 
If multiple pages are allocated + // then these pages form a linked list, with |m_state_page| being the head + Page *state_page; + + // Cached page where to add more blobs + Page *last_blob_page; + + // Page where to add more blobs - if |m_last_blob_page| was flushed + uint64_t last_blob_page_id; + + // tracks number of fetched pages + uint64_t page_count_fetched; + + // tracks number of index pages + uint64_t page_count_index; + + // tracks number of blob pages + uint64_t page_count_blob; + + // tracks number of page manager pages + uint64_t page_count_page_manager; + + // tracks number of cache hits + uint64_t cache_hits; + + // tracks number of cache misses + uint64_t cache_misses; + + // number of successful freelist hits + uint64_t freelist_hits; + + // number of freelist misses + uint64_t freelist_misses; +}; + +} // namespace hamsterdb + +#endif /* HAM_PAGE_MANAGER_STATE_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3page_manager/page_manager_test.h b/plugins/Dbx_kv/src/hamsterdb/src/3page_manager/page_manager_test.h new file mode 100644 index 0000000000..741cbc8390 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3page_manager/page_manager_test.h @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * A test gateway for the PageManager + * + * @exception_safe: no + * @thread_safe: no + */ + +#ifndef HAM_PAGE_MANAGER_TEST_H +#define HAM_PAGE_MANAGER_TEST_H + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "3page_manager/page_manager_state.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class Page; +class PageManager; + +class PageManagerTest +{ + public: + // Constructor + PageManagerTest(PageManager *page_manager); + + // Stores the local PageManager state to disk; returns the blob id + uint64_t store_state(); + + // Removes a page from the list; only for testing. + void remove_page(Page *page); + + // Returns true if a page is free. Ignores multi-pages; only for + // testing and integrity checks + bool is_page_free(uint64_t pageid); + + // Fetches a page from the cache + Page *fetch_page(uint64_t id); + + // Stores a page in the cache + void store_page(Page *page); + + // Returns true if the cache is full + bool is_cache_full(); + + // Returns the state + PageManagerState *state(); + + private: + // Reference of the PageManager instance + PageManager *m_sut; +}; + +} // namespace hamsterdb + +#endif /* HAM_PAGE_MANAGER_TEST_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/3page_manager/page_manager_worker.h b/plugins/Dbx_kv/src/hamsterdb/src/3page_manager/page_manager_worker.h new file mode 100644 index 0000000000..2a66189765 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/3page_manager/page_manager_worker.h @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * The worker thread for the PageManager + */ + +#ifndef HAM_PAGE_MANAGER_WORKER_H +#define HAM_PAGE_MANAGER_WORKER_H + +#include "0root/root.h" + +#include <vector> +#include <boost/thread.hpp> +#include <boost/atomic.hpp> + +// Always verify that a file of level N does not include headers > N! +#include "2device/device.h" +#include "2queue/queue.h" +#include "2worker/worker.h" +#include "3cache/cache.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +struct FlushPageMessage : public MessageBase +{ + // The available message types + enum { + kFlushPage = 1, + }; + + FlushPageMessage() + : MessageBase(kFlushPage, 0) { + } + + std::vector<Page *> list; +}; + + +class PageManagerWorker : public Worker +{ + public: + PageManagerWorker(Cache *cache) + : Worker(), m_cache(cache) { + } + + private: + virtual void handle_message(MessageBase *message) { + switch (message->type) { + case FlushPageMessage::kFlushPage: { + FlushPageMessage *fpm = (FlushPageMessage *)message; + for (std::vector<Page *>::iterator it = fpm->list.begin(); + it != fpm->list.end(); + ++it) { + Page *page = *it; + ham_assert(page != 0); + ham_assert(page->mutex().try_lock() == false); + try { + page->flush(); + } + catch (Exception &) { + page->mutex().unlock(); + throw; + } + page->mutex().unlock(); + } + break; + } + default: + ham_assert(!"shouldn't be here"); + } + } + + // The PageManager's cache + Cache *m_cache; +}; + +} // namespace hamsterdb + +#endif // HAM_PAGE_MANAGER_WORKER_H diff --git 
a/plugins/Dbx_kv/src/hamsterdb/src/4context/context.h b/plugins/Dbx_kv/src/hamsterdb/src/4context/context.h new file mode 100644 index 0000000000..7a88aa211e --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4context/context.h @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @exception_safe: nothrow + * @thread_safe: no + */ + +#ifndef HAM_CONTEXT_H +#define HAM_CONTEXT_H + +#include "0root/root.h" + +#include "3changeset/changeset.h" + +namespace hamsterdb { + +class Cursor; +class LocalDatabase; +class LocalEnvironment; +class LocalTransaction; + +struct Context +{ + Context(LocalEnvironment *env, LocalTransaction *txn = 0, + LocalDatabase *db = 0) + : env(env), txn(txn), db(db), changeset(env) { + } + + ~Context() { + changeset.clear(); + } + + LocalEnvironment *env; + LocalTransaction *txn; + LocalDatabase *db; + + // Each operation has its own changeset which stores all locked pages + Changeset changeset; +}; + +} // namespace hamsterdb + +#endif /* HAM_CONTEXT_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4cursor/cursor.cc b/plugins/Dbx_kv/src/hamsterdb/src/4cursor/cursor.cc new file mode 100644 index 0000000000..57cc80a6f6 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4cursor/cursor.cc @@ -0,0 +1,1119 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +#include <string.h> + +// Always verify that a file of level N does not include headers > N! +#include "3btree/btree_cursor.h" +#include "3btree/btree_index.h" +#include "3btree/btree_node_proxy.h" +#include "4cursor/cursor.h" +#include "4env/env_local.h" +#include "4txn/txn_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +using namespace hamsterdb; + +Cursor::Cursor(LocalDatabase *db, Transaction *txn, uint32_t flags) + : m_db(db), m_txn(txn), m_txn_cursor(this), m_btree_cursor(this), + m_remote_handle(0), m_next(0), m_previous(0), m_dupecache_index(0), + m_lastop(0), m_last_cmp(0), m_flags(flags), m_is_first_use(true) +{ +} + +Cursor::Cursor(Cursor &other) + : m_db(other.m_db), m_txn_cursor(this), m_btree_cursor(this) +{ + m_txn = other.m_txn; + m_remote_handle = other.m_remote_handle; + m_next = other.m_next; + m_previous = other.m_previous; + m_dupecache_index = other.m_dupecache_index; + m_lastop = other.m_lastop; + m_last_cmp = other.m_last_cmp; + m_flags = other.m_flags; + m_is_first_use = other.m_is_first_use; + + m_btree_cursor.clone(&other.m_btree_cursor); + m_txn_cursor.clone(&other.m_txn_cursor); + + if (m_db->get_flags() & HAM_ENABLE_DUPLICATE_KEYS) + other.m_dupecache.clone(&m_dupecache); +} + +void +Cursor::append_btree_duplicates(Context *context, BtreeCursor *btc, + DupeCache *dc) +{ + uint32_t count = 
btc->get_record_count(context, 0); + for (uint32_t i = 0; i < count; i++) + dc->append(DupeCacheLine(true, i)); +} + +void +Cursor::update_dupecache(Context *context, uint32_t what) +{ + if (!(m_db->get_flags() & HAM_ENABLE_DUPLICATE_KEYS)) + return; + + /* if the cache already exists: no need to continue, it should be + * up to date */ + if (m_dupecache.get_count() != 0) + return; + + if ((what & kBtree) && (what & kTxn)) { + if (is_nil(kBtree) && !is_nil(kTxn)) { + bool equal_keys; + sync(context, 0, &equal_keys); + if (!equal_keys) + set_to_nil(kBtree); + } + } + + /* first collect all duplicates from the btree. They're already sorted, + * therefore we can just append them to our duplicate-cache. */ + if ((what & kBtree) && !is_nil(kBtree)) + append_btree_duplicates(context, &m_btree_cursor, &m_dupecache); + + /* read duplicates from the txn-cursor? */ + if ((what & kTxn) && !is_nil(kTxn)) { + TransactionOperation *op = m_txn_cursor.get_coupled_op(); + TransactionNode *node = op->get_node(); + + if (!node) + return; + + /* now start integrating the items from the transactions */ + op = node->get_oldest_op(); + while (op) { + Transaction *optxn = op->get_txn(); + /* collect all ops that are valid (even those that are + * from conflicting transactions) */ + if (!optxn->is_aborted()) { + /* a normal (overwriting) insert will overwrite ALL dupes, + * but an overwrite of a duplicate will only overwrite + * an entry in the dupecache */ + if (op->get_flags() & TransactionOperation::kInsert) { + /* all existing dupes are overwritten */ + m_dupecache.clear(); + m_dupecache.append(DupeCacheLine(false, op)); + } + else if (op->get_flags() & TransactionOperation::kInsertOverwrite) { + uint32_t ref = op->get_referenced_dupe(); + if (ref) { + ham_assert(ref <= m_dupecache.get_count()); + DupeCacheLine *e = m_dupecache.get_first_element(); + (&e[ref - 1])->set_txn_op(op); + } + else { + /* all existing dupes are overwritten */ + m_dupecache.clear(); + 
m_dupecache.append(DupeCacheLine(false, op)); + } + } + /* insert a duplicate key */ + else if (op->get_flags() & TransactionOperation::kInsertDuplicate) { + uint32_t of = op->get_orig_flags(); + uint32_t ref = op->get_referenced_dupe() - 1; + DupeCacheLine dcl(false, op); + if (of & HAM_DUPLICATE_INSERT_FIRST) + m_dupecache.insert(0, dcl); + else if (of & HAM_DUPLICATE_INSERT_BEFORE) { + m_dupecache.insert(ref, dcl); + } + else if (of & HAM_DUPLICATE_INSERT_AFTER) { + if (ref + 1 >= m_dupecache.get_count()) + m_dupecache.append(dcl); + else + m_dupecache.insert(ref + 1, dcl); + } + else /* default is HAM_DUPLICATE_INSERT_LAST */ + m_dupecache.append(dcl); + } + /* a normal erase will erase ALL duplicate keys */ + else if (op->get_flags() & TransactionOperation::kErase) { + uint32_t ref = op->get_referenced_dupe(); + if (ref) { + ham_assert(ref <= m_dupecache.get_count()); + m_dupecache.erase(ref - 1); + } + else { + /* all existing dupes are erased */ + m_dupecache.clear(); + } + } + else { + /* everything else is a bug! */ + ham_assert(op->get_flags() == TransactionOperation::kNop); + } + } + + /* continue with the previous/older operation */ + op = op->get_next_in_node(); + } + } +} + +void +Cursor::couple_to_dupe(uint32_t dupe_id) +{ + DupeCacheLine *e = 0; + + ham_assert(m_dupecache.get_count() >= dupe_id); + ham_assert(dupe_id >= 1); + + /* dupe-id is a 1-based index! 
*/ + e = m_dupecache.get_element(dupe_id - 1); + if (e->use_btree()) { + couple_to_btree(); + m_btree_cursor.set_duplicate_index((uint32_t)e->get_btree_dupe_idx()); + } + else { + ham_assert(e->get_txn_op() != 0); + m_txn_cursor.couple_to_op(e->get_txn_op()); + couple_to_txnop(); + } + set_dupecache_index(dupe_id); +} + +ham_status_t +Cursor::check_if_btree_key_is_erased_or_overwritten(Context *context) +{ + ham_key_t key = {0}; + TransactionOperation *op; + // TODO not threadsafe - will leak if an exception is thrown + Cursor *clone = get_db()->cursor_clone_impl(this); + + ham_status_t st = m_btree_cursor.move(context, &key, + &get_db()->key_arena(get_txn()), 0, 0, 0); + if (st) { + get_db()->cursor_close(clone); + return (st); + } + + st = clone->m_txn_cursor.find(&key, 0); + if (st) { + get_db()->cursor_close_impl(clone); + delete clone; + return (st); + } + + op = clone->m_txn_cursor.get_coupled_op(); + if (op->get_flags() & TransactionOperation::kInsertDuplicate) + st = HAM_KEY_NOT_FOUND; + get_db()->cursor_close_impl(clone); + delete clone; + return (st); +} + +void +Cursor::sync(Context *context, uint32_t flags, bool *equal_keys) +{ + if (equal_keys) + *equal_keys = false; + + if (is_nil(kBtree)) { + if (!m_txn_cursor.get_coupled_op()) + return; + ham_key_t *key = m_txn_cursor.get_coupled_op()->get_node()->get_key(); + + if (!(flags & kSyncOnlyEqualKeys)) + flags = flags | ((flags & HAM_CURSOR_NEXT) + ? HAM_FIND_GEQ_MATCH + : HAM_FIND_LEQ_MATCH); + /* the flag |kSyncDontLoadKey| does not load the key if there's an + * approx match - it only positions the cursor */ + ham_status_t st = m_btree_cursor.find(context, key, 0, 0, 0, + kSyncDontLoadKey | flags); + /* if we had a direct hit instead of an approx. 
match then + * set |equal_keys| to false; otherwise Cursor::move() + * will move the btree cursor again */ + if (st == 0 && equal_keys && !ham_key_get_approximate_match_type(key)) + *equal_keys = true; + } + else if (is_nil(kTxn)) { + // TODO not threadsafe - will leak if an exception is thrown + Cursor *clone = get_db()->cursor_clone_impl(this); + clone->m_btree_cursor.uncouple_from_page(context); + ham_key_t *key = clone->m_btree_cursor.get_uncoupled_key(); + if (!(flags & kSyncOnlyEqualKeys)) + flags = flags | ((flags & HAM_CURSOR_NEXT) + ? HAM_FIND_GEQ_MATCH + : HAM_FIND_LEQ_MATCH); + + ham_status_t st = m_txn_cursor.find(key, kSyncDontLoadKey | flags); + /* if we had a direct hit instead of an approx. match then + * set |equal_keys| to false; otherwise Cursor::move() + * will move the btree cursor again */ + if (st == 0 && equal_keys && !ham_key_get_approximate_match_type(key)) + *equal_keys = true; + get_db()->cursor_close_impl(clone); + delete clone; + } +} + +ham_status_t +Cursor::move_next_dupe(Context *context) +{ + if (get_dupecache_index()) { + if (get_dupecache_index() < m_dupecache.get_count()) { + set_dupecache_index(get_dupecache_index() + 1); + couple_to_dupe(get_dupecache_index()); + return (0); + } + } + return (HAM_LIMITS_REACHED); +} + +ham_status_t +Cursor::move_previous_dupe(Context *context) +{ + if (get_dupecache_index()) { + if (get_dupecache_index() > 1) { + set_dupecache_index(get_dupecache_index() - 1); + couple_to_dupe(get_dupecache_index()); + return (0); + } + } + return (HAM_LIMITS_REACHED); +} + +ham_status_t +Cursor::move_first_dupe(Context *context) +{ + if (m_dupecache.get_count()) { + set_dupecache_index(1); + couple_to_dupe(get_dupecache_index()); + return (0); + } + return (HAM_LIMITS_REACHED); +} + +ham_status_t +Cursor::move_last_dupe(Context *context) +{ + if (m_dupecache.get_count()) { + set_dupecache_index(m_dupecache.get_count()); + couple_to_dupe(get_dupecache_index()); + return (0); + } + return (HAM_LIMITS_REACHED); 
+} + +static bool +__txn_cursor_is_erase(TransactionCursor *txnc) +{ + TransactionOperation *op = txnc->get_coupled_op(); + return (op + ? (op->get_flags() & TransactionOperation::kErase) != 0 + : false); +} + +int +Cursor::compare(Context *context) +{ + BtreeCursor *btrc = get_btree_cursor(); + BtreeIndex *btree = get_db()->btree_index(); + + TransactionNode *node = m_txn_cursor.get_coupled_op()->get_node(); + ham_key_t *txnk = node->get_key(); + + ham_assert(!is_nil(0)); + ham_assert(!m_txn_cursor.is_nil()); + + if (btrc->get_state() == BtreeCursor::kStateCoupled) { + Page *page; + int slot; + btrc->get_coupled_key(&page, &slot, 0); + m_last_cmp = btree->get_node_from_page(page)->compare(context, txnk, slot); + + // need to fix the sort order - we compare txnk vs page[slot], but the + // caller expects m_last_cmp to be the comparison of page[slot] vs txnk + if (m_last_cmp < 0) + m_last_cmp = +1; + else if (m_last_cmp > 0) + m_last_cmp = -1; + + return (m_last_cmp); + } + else if (btrc->get_state() == BtreeCursor::kStateUncoupled) { + m_last_cmp = btree->compare_keys(btrc->get_uncoupled_key(), txnk); + return (m_last_cmp); + } + + ham_assert(!"shouldn't be here"); + return (0); +} + +ham_status_t +Cursor::move_next_key_singlestep(Context *context) +{ + ham_status_t st = 0; + BtreeCursor *btrc = get_btree_cursor(); + + /* if both cursors point to the same key: move next with both */ + if (m_last_cmp == 0) { + if (!is_nil(kBtree)) { + st = btrc->move(context, 0, 0, 0, 0, + HAM_CURSOR_NEXT | HAM_SKIP_DUPLICATES); + if (st == HAM_KEY_NOT_FOUND || st == HAM_CURSOR_IS_NIL) { + set_to_nil(kBtree); // TODO muss raus + if (m_txn_cursor.is_nil()) + return (HAM_KEY_NOT_FOUND); + else { + couple_to_txnop(); + m_last_cmp = 1; + } + } + } + if (!m_txn_cursor.is_nil()) { + st = m_txn_cursor.move(HAM_CURSOR_NEXT); + if (st == HAM_KEY_NOT_FOUND || st==HAM_CURSOR_IS_NIL) { + set_to_nil(kTxn); // TODO muss raus + if (is_nil(kBtree)) + return (HAM_KEY_NOT_FOUND); + else { + 
couple_to_btree(); + m_last_cmp = -1; + + ham_status_t st2 = check_if_btree_key_is_erased_or_overwritten(context); + if (st2 == HAM_TXN_CONFLICT) + st = st2; + } + } + } + } + /* if the btree-key is smaller: move it next */ + else if (m_last_cmp < 0) { + st = btrc->move(context, 0, 0, 0, 0, HAM_CURSOR_NEXT | HAM_SKIP_DUPLICATES); + if (st == HAM_KEY_NOT_FOUND) { + set_to_nil(kBtree); // TODO Das muss raus! + if (m_txn_cursor.is_nil()) + return (st); + couple_to_txnop(); + m_last_cmp = +1; + } + else { + ham_status_t st2 = check_if_btree_key_is_erased_or_overwritten(context); + if (st2 == HAM_TXN_CONFLICT) + st = st2; + } + if (m_txn_cursor.is_nil()) + m_last_cmp = -1; + } + /* if the txn-key is smaller OR if both keys are equal: move next + * with the txn-key (which is chronologically newer) */ + else { + st = m_txn_cursor.move(HAM_CURSOR_NEXT); + if (st == HAM_KEY_NOT_FOUND) { + set_to_nil(kTxn); // TODO Das muss raus! + if (is_nil(kBtree)) + return (st); + couple_to_btree(); + m_last_cmp = -1; + } + if (is_nil(kBtree)) + m_last_cmp = 1; + } + + /* compare keys again */ + if (!is_nil(kBtree) && !m_txn_cursor.is_nil()) + compare(context); + + /* if there's a txn conflict: move next */ + if (st == HAM_TXN_CONFLICT) + return (move_next_key_singlestep(context)); + + /* btree-key is smaller */ + if (m_last_cmp < 0 || m_txn_cursor.is_nil()) { + couple_to_btree(); + update_dupecache(context, kBtree); + return (0); + } + /* txn-key is smaller */ + else if (m_last_cmp > 0 || btrc->get_state() == BtreeCursor::kStateNil) { + couple_to_txnop(); + update_dupecache(context, kTxn); + return (0); + } + /* both keys are equal */ + else { + couple_to_txnop(); + update_dupecache(context, kTxn | kBtree); + return (0); + } +} + +ham_status_t +Cursor::move_next_key(Context *context, uint32_t flags) +{ + ham_status_t st; + + /* are we in the middle of a duplicate list? 
if yes then move to the + * next duplicate */ + if (get_dupecache_index() > 0 && !(flags & HAM_SKIP_DUPLICATES)) { + st = move_next_dupe(context); + if (st != HAM_LIMITS_REACHED) + return (st); + else if (st == HAM_LIMITS_REACHED && (flags & HAM_ONLY_DUPLICATES)) + return (HAM_KEY_NOT_FOUND); + } + + clear_dupecache(); + + /* either there were no duplicates or we've reached the end of the + * duplicate list. move next till we found a new candidate */ + while (1) { + st = move_next_key_singlestep(context); + if (st) + return (st); + + /* check for duplicates. the dupecache was already updated in + * move_next_key_singlestep() */ + if (m_db->get_flags() & HAM_ENABLE_DUPLICATE_KEYS) { + /* are there any duplicates? if not then they were all erased and + * we move to the previous key */ + if (!has_duplicates()) + continue; + + /* otherwise move to the first duplicate */ + return (move_first_dupe(context)); + } + + /* no duplicates - make sure that we've not coupled to an erased + * item */ + if (is_coupled_to_txnop()) { + if (__txn_cursor_is_erase(&m_txn_cursor)) + continue; + else + return (0); + } + if (is_coupled_to_btree()) { + st = check_if_btree_key_is_erased_or_overwritten(context); + if (st == HAM_KEY_ERASED_IN_TXN) + continue; + else if (st == 0) { + couple_to_txnop(); + return (0); + } + else if (st == HAM_KEY_NOT_FOUND) + return (0); + else + return (st); + } + else + return (HAM_KEY_NOT_FOUND); + } + + ham_assert(!"should never reach this"); + return (HAM_INTERNAL_ERROR); +} + +ham_status_t +Cursor::move_previous_key_singlestep(Context *context) +{ + ham_status_t st = 0; + BtreeCursor *btrc = get_btree_cursor(); + + /* if both cursors point to the same key: move previous with both */ + if (m_last_cmp == 0) { + if (!is_nil(kBtree)) { + st = btrc->move(context, 0, 0, 0, 0, + HAM_CURSOR_PREVIOUS | HAM_SKIP_DUPLICATES); + if (st == HAM_KEY_NOT_FOUND || st == HAM_CURSOR_IS_NIL) { + set_to_nil(kBtree); // TODO muss raus + if (m_txn_cursor.is_nil()) + return 
(HAM_KEY_NOT_FOUND); + else { + couple_to_txnop(); + m_last_cmp = -1; + } + } + } + if (!m_txn_cursor.is_nil()) { + st = m_txn_cursor.move(HAM_CURSOR_PREVIOUS); + if (st == HAM_KEY_NOT_FOUND || st==HAM_CURSOR_IS_NIL) { + set_to_nil(kTxn); // TODO muss raus + if (is_nil(kBtree)) + return (HAM_KEY_NOT_FOUND); + else { + couple_to_btree(); + m_last_cmp = 1; + } + } + } + } + /* if the btree-key is greater: move previous */ + else if (m_last_cmp > 0) { + st = btrc->move(context, 0, 0, 0, 0, + HAM_CURSOR_PREVIOUS | HAM_SKIP_DUPLICATES); + if (st == HAM_KEY_NOT_FOUND) { + set_to_nil(kBtree); // TODO Das muss raus! + if (m_txn_cursor.is_nil()) + return (st); + couple_to_txnop(); + m_last_cmp = -1; + } + else { + ham_status_t st2 = check_if_btree_key_is_erased_or_overwritten(context); + if (st2 == HAM_TXN_CONFLICT) + st = st2; + } + if (m_txn_cursor.is_nil()) + m_last_cmp = 1; + } + /* if the txn-key is greater OR if both keys are equal: move previous + * with the txn-key (which is chronologically newer) */ + else { + st = m_txn_cursor.move(HAM_CURSOR_PREVIOUS); + if (st == HAM_KEY_NOT_FOUND) { + set_to_nil(kTxn); // TODO Das muss raus! 
+ if (is_nil(kBtree)) + return (st); + couple_to_btree(); + m_last_cmp = 1; + + ham_status_t st2 = check_if_btree_key_is_erased_or_overwritten(context); + if (st2 == HAM_TXN_CONFLICT) + st = st2; + } + if (is_nil(kBtree)) + m_last_cmp = -1; + } + + /* compare keys again */ + if (!is_nil(kBtree) && !m_txn_cursor.is_nil()) + compare(context); + + /* if there's a txn conflict: move previous */ + if (st == HAM_TXN_CONFLICT) + return (move_previous_key_singlestep(context)); + + /* btree-key is greater */ + if (m_last_cmp > 0 || m_txn_cursor.is_nil()) { + couple_to_btree(); + update_dupecache(context, kBtree); + return (0); + } + /* txn-key is greater */ + else if (m_last_cmp < 0 || btrc->get_state() == BtreeCursor::kStateNil) { + couple_to_txnop(); + update_dupecache(context, kTxn); + return (0); + } + /* both keys are equal */ + else { + couple_to_txnop(); + update_dupecache(context, kTxn | kBtree); + return (0); + } +} + +ham_status_t +Cursor::move_previous_key(Context *context, uint32_t flags) +{ + ham_status_t st; + + /* are we in the middle of a duplicate list? if yes then move to the + * previous duplicate */ + if (get_dupecache_index() > 0 && !(flags & HAM_SKIP_DUPLICATES)) { + st = move_previous_dupe(context); + if (st != HAM_LIMITS_REACHED) + return (st); + else if (st == HAM_LIMITS_REACHED && (flags & HAM_ONLY_DUPLICATES)) + return (HAM_KEY_NOT_FOUND); + } + + clear_dupecache(); + + /* either there were no duplicates or we've reached the end of the + * duplicate list. move previous till we found a new candidate */ + while (!is_nil(kBtree) || !m_txn_cursor.is_nil()) { + st = move_previous_key_singlestep(context); + if (st) + return (st); + + /* check for duplicates. the dupecache was already updated in + * move_previous_key_singlestep() */ + if (m_db->get_flags() & HAM_ENABLE_DUPLICATE_KEYS) { + /* are there any duplicates? 
if not then they were all erased and + * we move to the previous key */ + if (!has_duplicates()) + continue; + + /* otherwise move to the last duplicate */ + return (move_last_dupe(context)); + } + + /* no duplicates - make sure that we've not coupled to an erased + * item */ + if (is_coupled_to_txnop()) { + if (__txn_cursor_is_erase(&m_txn_cursor)) + continue; + else + return (0); + } + if (is_coupled_to_btree()) { + st = check_if_btree_key_is_erased_or_overwritten(context); + if (st == HAM_KEY_ERASED_IN_TXN) + continue; + else if (st == 0) { + couple_to_txnop(); + return (0); + } + else if (st == HAM_KEY_NOT_FOUND) + return (0); + else + return (st); + } + else + return (HAM_KEY_NOT_FOUND); + } + + return (HAM_KEY_NOT_FOUND); +} + +ham_status_t +Cursor::move_first_key_singlestep(Context *context) +{ + ham_status_t btrs, txns; + BtreeCursor *btrc = get_btree_cursor(); + + /* fetch the smallest key from the transaction tree. */ + txns = m_txn_cursor.move(HAM_CURSOR_FIRST); + /* fetch the smallest key from the btree tree. 
*/ + btrs = btrc->move(context, 0, 0, 0, 0, + HAM_CURSOR_FIRST | HAM_SKIP_DUPLICATES); + /* now consolidate - if both trees are empty then return */ + if (btrs == HAM_KEY_NOT_FOUND && txns == HAM_KEY_NOT_FOUND) { + return (HAM_KEY_NOT_FOUND); + } + /* if btree is empty but txn-tree is not: couple to txn */ + else if (btrs == HAM_KEY_NOT_FOUND && txns != HAM_KEY_NOT_FOUND) { + if (txns == HAM_TXN_CONFLICT) + return (txns); + couple_to_txnop(); + update_dupecache(context, kTxn); + return (0); + } + /* if txn-tree is empty but btree is not: couple to btree */ + else if (txns == HAM_KEY_NOT_FOUND && btrs != HAM_KEY_NOT_FOUND) { + couple_to_btree(); + update_dupecache(context, kBtree); + return (0); + } + /* if both trees are not empty then compare them and couple to the + * smaller one */ + else { + ham_assert(btrs == 0 && (txns == 0 + || txns == HAM_KEY_ERASED_IN_TXN + || txns == HAM_TXN_CONFLICT)); + compare(context); + + /* both keys are equal - couple to txn; it's chronologically + * newer */ + if (m_last_cmp == 0) { + if (txns && txns != HAM_KEY_ERASED_IN_TXN) + return (txns); + couple_to_txnop(); + update_dupecache(context, kBtree | kTxn); + } + /* couple to txn */ + else if (m_last_cmp > 0) { + if (txns && txns != HAM_KEY_ERASED_IN_TXN) + return (txns); + couple_to_txnop(); + update_dupecache(context, kTxn); + } + /* couple to btree */ + else { + couple_to_btree(); + update_dupecache(context, kBtree); + } + return (0); + } +} + +ham_status_t +Cursor::move_first_key(Context *context, uint32_t flags) +{ + ham_status_t st = 0; + + /* move to the very very first key */ + st = move_first_key_singlestep(context); + if (st) + return (st); + + /* check for duplicates. the dupecache was already updated in + * move_first_key_singlestep() */ + if (m_db->get_flags() & HAM_ENABLE_DUPLICATE_KEYS) { + /* are there any duplicates? 
if not then they were all erased and we + * move to the previous key */ + if (!has_duplicates()) + return (move_next_key(context, flags)); + + /* otherwise move to the first duplicate */ + return (move_first_dupe(context)); + } + + /* no duplicates - make sure that we've not coupled to an erased + * item */ + if (is_coupled_to_txnop()) { + if (__txn_cursor_is_erase(&m_txn_cursor)) + return (move_next_key(context, flags)); + else + return (0); + } + if (is_coupled_to_btree()) { + st = check_if_btree_key_is_erased_or_overwritten(context); + if (st == HAM_KEY_ERASED_IN_TXN) + return (move_next_key(context, flags)); + else if (st == 0) { + couple_to_txnop(); + return (0); + } + else if (st == HAM_KEY_NOT_FOUND) + return (0); + else + return (st); + } + else + return (HAM_KEY_NOT_FOUND); +} + +ham_status_t +Cursor::move_last_key_singlestep(Context *context) +{ + ham_status_t btrs, txns; + BtreeCursor *btrc = get_btree_cursor(); + + /* fetch the largest key from the transaction tree. */ + txns = m_txn_cursor.move(HAM_CURSOR_LAST); + /* fetch the largest key from the btree tree. 
*/ + btrs = btrc->move(context, 0, 0, 0, 0, HAM_CURSOR_LAST | HAM_SKIP_DUPLICATES); + /* now consolidate - if both trees are empty then return */ + if (btrs == HAM_KEY_NOT_FOUND && txns == HAM_KEY_NOT_FOUND) { + return (HAM_KEY_NOT_FOUND); + } + /* if btree is empty but txn-tree is not: couple to txn */ + else if (btrs == HAM_KEY_NOT_FOUND && txns != HAM_KEY_NOT_FOUND) { + if (txns == HAM_TXN_CONFLICT) + return (txns); + couple_to_txnop(); + update_dupecache(context, kTxn); + return (0); + } + /* if txn-tree is empty but btree is not: couple to btree */ + else if (txns == HAM_KEY_NOT_FOUND && btrs != HAM_KEY_NOT_FOUND) { + couple_to_btree(); + update_dupecache(context, kBtree); + return (0); + } + /* if both trees are not empty then compare them and couple to the + * greater one */ + else { + ham_assert(btrs == 0 && (txns == 0 + || txns == HAM_KEY_ERASED_IN_TXN + || txns == HAM_TXN_CONFLICT)); + compare(context); + + /* both keys are equal - couple to txn; it's chronologically + * newer */ + if (m_last_cmp == 0) { + if (txns && txns != HAM_KEY_ERASED_IN_TXN) + return (txns); + couple_to_txnop(); + update_dupecache(context, kBtree | kTxn); + } + /* couple to txn */ + else if (m_last_cmp < 1) { + if (txns && txns != HAM_KEY_ERASED_IN_TXN) + return (txns); + couple_to_txnop(); + update_dupecache(context, kTxn); + } + /* couple to btree */ + else { + couple_to_btree(); + update_dupecache(context, kBtree); + } + return (0); + } +} + +ham_status_t +Cursor::move_last_key(Context *context, uint32_t flags) +{ + ham_status_t st = 0; + + /* move to the very very last key */ + st = move_last_key_singlestep(context); + if (st) + return (st); + + /* check for duplicates. the dupecache was already updated in + * move_last_key_singlestep() */ + if (m_db->get_flags() & HAM_ENABLE_DUPLICATE_KEYS) { + /* are there any duplicates? 
if not then they were all erased and we + * move to the previous key */ + if (!has_duplicates()) + return (move_previous_key(context, flags)); + + /* otherwise move to the last duplicate */ + return (move_last_dupe(context)); + } + + /* no duplicates - make sure that we've not coupled to an erased + * item */ + if (is_coupled_to_txnop()) { + if (__txn_cursor_is_erase(&m_txn_cursor)) + return (move_previous_key(context, flags)); + else + return (0); + } + if (is_coupled_to_btree()) { + st = check_if_btree_key_is_erased_or_overwritten(context); + if (st == HAM_KEY_ERASED_IN_TXN) + return (move_previous_key(context, flags)); + else if (st == 0) { + couple_to_txnop(); + return (0); + } + else if (st == HAM_KEY_NOT_FOUND) + return (0); + else + return (st); + } + else + return (HAM_KEY_NOT_FOUND); +} + +ham_status_t +Cursor::move(Context *context, ham_key_t *key, ham_record_t *record, + uint32_t flags) +{ + ham_status_t st = 0; + bool changed_dir = false; + BtreeCursor *btrc = get_btree_cursor(); + + /* no movement requested? directly retrieve key/record */ + if (!flags) + goto retrieve_key_and_record; + + /* synchronize the btree and transaction cursor if the last operation was + * not a move next/previous OR if the direction changed */ + if ((m_lastop == HAM_CURSOR_PREVIOUS) && (flags & HAM_CURSOR_NEXT)) + changed_dir = true; + else if ((m_lastop == HAM_CURSOR_NEXT) && (flags & HAM_CURSOR_PREVIOUS)) + changed_dir = true; + if (((flags & HAM_CURSOR_NEXT) || (flags & HAM_CURSOR_PREVIOUS)) + && (m_lastop == Cursor::kLookupOrInsert + || changed_dir)) { + if (is_coupled_to_txnop()) + set_to_nil(kBtree); + else + set_to_nil(kTxn); + (void)sync(context, flags, 0); + + if (!m_txn_cursor.is_nil() && !is_nil(kBtree)) + compare(context); + } + + /* we have either skipped duplicates or reached the end of the duplicate + * list. btree cursor and txn cursor are synced and as close to + * each other as possible. Move the cursor in the requested direction. 
*/ + if (flags & HAM_CURSOR_NEXT) { + st = move_next_key(context, flags); + } + else if (flags & HAM_CURSOR_PREVIOUS) { + st = move_previous_key(context, flags); + } + else if (flags & HAM_CURSOR_FIRST) { + clear_dupecache(); + st = move_first_key(context, flags); + } + else { + ham_assert(flags & HAM_CURSOR_LAST); + clear_dupecache(); + st = move_last_key(context, flags); + } + + if (st) + return (st); + +retrieve_key_and_record: + /* retrieve key/record, if requested */ + if (st == 0) { + if (is_coupled_to_txnop()) { +#ifdef HAM_DEBUG + TransactionOperation *op = m_txn_cursor.get_coupled_op(); + ham_assert(!(op->get_flags() & TransactionOperation::kErase)); +#endif + try { + if (key) + m_txn_cursor.copy_coupled_key(key); + if (record) + m_txn_cursor.copy_coupled_record(record); + } + catch (Exception &ex) { + return (ex.code); + } + } + else { + st = btrc->move(context, key, &get_db()->key_arena(get_txn()), + record, &get_db()->record_arena(get_txn()), 0); + } + } + + return (st); +} + +bool +Cursor::is_nil(int what) +{ + switch (what) { + case kBtree: + return (m_btree_cursor.get_state() == BtreeCursor::kStateNil); + case kTxn: + return (m_txn_cursor.is_nil()); + default: + ham_assert(what == 0); + return (m_btree_cursor.get_state() == BtreeCursor::kStateNil + && m_txn_cursor.is_nil()); + } +} + +void +Cursor::set_to_nil(int what) +{ + switch (what) { + case kBtree: + m_btree_cursor.set_to_nil(); + break; + case kTxn: + m_txn_cursor.set_to_nil(); + couple_to_btree(); /* reset flag */ + break; + default: + ham_assert(what == 0); + m_btree_cursor.set_to_nil(); + m_txn_cursor.set_to_nil(); + couple_to_btree(); /* reset flag */ + m_is_first_use = true; + break; + } +} + +uint32_t +Cursor::get_record_count(Context *context, uint32_t flags) +{ + if (is_nil()) + throw Exception(HAM_CURSOR_IS_NIL); + + if (m_txn || is_coupled_to_txnop()) { + if (m_db->get_flags() & HAM_ENABLE_DUPLICATE_KEYS) { + bool dummy; + sync(context, 0, &dummy); + update_dupecache(context, kTxn | 
kBtree); + return (m_dupecache.get_count()); + } + else { + /* obviously the key exists, since the cursor is coupled */ + return (1); + } + } + + return (m_btree_cursor.get_record_count(context, flags)); +} + +uint64_t +Cursor::get_record_size(Context *context) +{ + if (is_nil()) + return (HAM_CURSOR_IS_NIL); + + if (is_coupled_to_txnop()) + return (m_txn_cursor.get_record_size()); + else + return (m_btree_cursor.get_record_size(context)); +} + +uint32_t +Cursor::get_duplicate_position() +{ + if (is_nil()) + throw Exception(HAM_CURSOR_IS_NIL); + + // use btree cursor? + if (m_txn_cursor.is_nil()) + return (m_btree_cursor.get_duplicate_index()); + + // otherwise return the index in the duplicate cache + return (get_dupecache_index() - 1); +} + +ham_status_t +Cursor::overwrite(Context *context, Transaction *htxn, + ham_record_t *record, uint32_t flags) +{ + ham_status_t st = 0; + LocalTransaction *txn = dynamic_cast<LocalTransaction *>(htxn); + ham_assert(context->txn == txn); + + /* + * if we're in transactional mode then just append an "insert/OW" operation + * to the txn-tree. + * + * if the txn_cursor is already coupled to a txn-op, then we can use + * txn_cursor_overwrite(). Otherwise we have to call db_insert_txn(). + * + * If transactions are disabled then overwrite the item in the btree. 
/*
 * Closes the cursor: closes the embedded btree cursor and empties the
 * duplicate cache. The txn cursor is not touched by this function.
 */
void
Cursor::close()
{
  m_btree_cursor.close();
  m_dupecache.clear();
}
If + * Transactions are used, then it also uses a duplicate cache for + * consolidating the duplicate keys from both cursors. This layer is + * described and implemented in cursor.h (this file). + * + * A Cursor can have several states. It can be + * 1. NIL (not in list) - this is the default state, meaning that the Cursor + * does not point to any key. If the Cursor was initialized, then it's + * "NIL". If the Cursor was erased (i.e. with ham_cursor_erase) then it's + * also "NIL". + * + * relevant functions: + * Cursor::is_nil + * Cursor::set_to_nil + * + * 2. Coupled to the txn-cursor - meaning that the Cursor points to a key + * that is modified in a Transaction. Technically, the txn-cursor points + * to a TransactionOperation structure. + * + * relevant functions: + * Cursor::is_coupled_to_txnop + * Cursor::couple_to_txnop + * + * 3. Coupled to the btree-cursor - meaning that the Cursor points to a key + * that is stored in a Btree. A Btree cursor itself can then be coupled + * (it directly points to a page in the cache) or uncoupled, meaning that + * the page was purged from the cache and has to be fetched from disk when + * the Cursor is used again. This is described in btree_cursor.h. + * + * relevant functions: + * Cursor::is_coupled_to_btree + * Cursor::couple_to_btree + * + * The dupecache is used when information from the btree and the txn-tree + * is merged. The btree cursor has its private dupecache. The dupecache + * increases performance (and complexity). + * + * The cursor interface is used in db_local.cc. Many of the functions use + * a high-level cursor interface (i.e. @ref cursor_create, @ref cursor_clone) + * while some directly use the low-level interfaces of btree_cursor.h and + * txn_cursor.h. Over time i will clean this up, trying to maintain a clear + * separation of the 3 layers, and only accessing the top-level layer in + * cursor.h. This is work in progress. 
+ * + * In order to speed up Cursor::move() we keep track of the last compare + * between the two cursors. i.e. if the btree cursor is currently pointing to + * a larger key than the txn-cursor, the 'lastcmp' field is <0 etc. + * + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_CURSORS_H +#define HAM_CURSORS_H + +#include "0root/root.h" + +#include <vector> + +// Always verify that a file of level N does not include headers > N! +#include "1base/error.h" +#include "4txn/txn_cursor.h" +#include "3btree/btree_cursor.h" +#include "3blob_manager/blob_manager.h" +#include "4db/db_local.h" +#include "4env/env.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +// A helper structure; ham_cursor_t is declared in ham/hamsterdb.h as an +// opaque C structure, but internally we use a C++ class. The ham_cursor_t +// struct satisfies the C compiler, and internally we just cast the pointers. +struct ham_cursor_t +{ + bool _dummy; +}; + +namespace hamsterdb { + +struct Context; + +// A single line in the dupecache structure - can reference a btree +// record or a txn-op +class DupeCacheLine +{ + public: + DupeCacheLine(bool use_btree = true, uint64_t btree_dupeidx = 0) + : m_btree_dupeidx(btree_dupeidx), m_op(0), m_use_btree(use_btree) { + ham_assert(use_btree == true); + } + + DupeCacheLine(bool use_btree, TransactionOperation *op) + : m_btree_dupeidx(0), m_op(op), m_use_btree(use_btree) { + ham_assert(use_btree == false); + } + + // Returns true if this cache entry is a duplicate in the btree index + // (otherwise it's a duplicate in the transaction index) + bool use_btree() const { + return (m_use_btree); + } + + // Returns the btree duplicate index + uint64_t get_btree_dupe_idx() { + ham_assert(m_use_btree == true); + return (m_btree_dupeidx); + } + + // Sets the btree duplicate index + void set_btree_dupe_idx(uint64_t idx) { + m_use_btree = true; + m_btree_dupeidx = idx; + m_op = 0; + } + + // Returns the txn-op duplicate + 
TransactionOperation *get_txn_op() { + ham_assert(m_use_btree == false); + return (m_op); + } + + // Sets the txn-op duplicate + void set_txn_op(TransactionOperation *op) { + m_use_btree = false; + m_op = op; + m_btree_dupeidx = 0; + } + + private: + // The btree duplicate index (of the original btree dupe table) + uint64_t m_btree_dupeidx; + + // The txn op structure that we refer to + TransactionOperation *m_op; + + // using btree or txn duplicates? + bool m_use_btree; +}; + +// +// The dupecache is a cache for duplicate keys +// +class DupeCache { + public: + // default constructor - creates an empty dupecache with room for 8 + // duplicates + DupeCache() { + m_elements.reserve(8); + } + + // Returns the number of elements in the cache + uint32_t get_count() const { + return ((uint32_t)m_elements.size()); + } + + // Returns an element from the cache + DupeCacheLine *get_element(unsigned idx) { + return (&m_elements[idx]); + } + + // Returns a pointer to the first element from the cache + DupeCacheLine *get_first_element() { + return (&m_elements[0]); + } + + // Clones this dupe-cache into 'other' + void clone(DupeCache *other) { + other->m_elements = m_elements; + } + + // Inserts a new item somewhere in the cache; resizes the + // cache if necessary + void insert(unsigned position, const DupeCacheLine &dcl) { + m_elements.insert(m_elements.begin() + position, dcl); + } + + // Append an element to the dupecache + void append(const DupeCacheLine &dcl) { + m_elements.push_back(dcl); + } + + // Erases an item + void erase(uint32_t position) { + m_elements.erase(m_elements.begin() + position); + } + + // Clears the cache; frees all resources + void clear() { + m_elements.resize(0); + } + + private: + // The cached elements + std::vector<DupeCacheLine> m_elements; +}; + + +// +// the Database Cursor +// +class Cursor +{ + public: + // The flags have ranges: + // 0 - 0x1000000-1: btree_cursor + // > 0x1000000: cursor + enum { + // Flags for set_to_nil, is_nil + kBoth 
= 0, + kBtree = 1, + kTxn = 2, + + // Flag for sync(): do not use approx matching if the key + // is not available + kSyncOnlyEqualKeys = 0x200000, + + // Flag for sync(): do not load the key if there's an approx. + // match. Only positions the cursor. + kSyncDontLoadKey = 0x100000, + + // Cursor flag: cursor is coupled to the txn-cursor + kCoupledToTxn = 0x1000000, + + // Flag for set_lastop() + kLookupOrInsert = 0x10000 + }; + + public: + // Constructor; retrieves pointer to db and txn, initializes all members + Cursor(LocalDatabase *db, Transaction *txn = 0, uint32_t flags = 0); + + // Copy constructor; used for cloning a Cursor + Cursor(Cursor &other); + + // Destructor; sets cursor to nil + ~Cursor() { + set_to_nil(); + } + + // Returns the Database + LocalDatabase *get_db() { + return (m_db); + } + + // Returns the Transaction handle + Transaction *get_txn() { + return (m_txn); + } + + // Sets the Transaction handle; often used to assign a temporary + // Transaction to this cursor + void set_txn(Transaction *txn) { + m_txn = txn; + } + + // Sets the cursor to nil + void set_to_nil(int what = kBoth); + + // Returns true if a cursor is nil (Not In List - does not point to any + // key) + // |what| is one of the flags kBoth, kTxn, kBtree + bool is_nil(int what = kBoth); + + // Couples the cursor to the btree key + void couple_to_btree() { + m_flags &= ~kCoupledToTxn; + } + + // Returns true if a cursor is coupled to the btree + bool is_coupled_to_btree() const { + return (!(m_flags & kCoupledToTxn)); + } + + // Couples the cursor to the txn-op + void couple_to_txnop() { + m_flags |= kCoupledToTxn; + } + + // Returns true if a cursor is coupled to a txn-op + bool is_coupled_to_txnop() const { + return ((m_flags & kCoupledToTxn) ? 
true : false); + } + + // Retrieves the number of duplicates of the current key + uint32_t get_record_count(Context *context, uint32_t flags); + + // Retrieves the duplicate position of a cursor + uint32_t get_duplicate_position(); + + // Retrieves the size of the current record + uint64_t get_record_size(Context *context); + + // Overwrites the record of the current key + // + // The Transaction is passed as a separate pointer since it might be a + // local/temporary Transaction that was created only for this single + // operation. + ham_status_t overwrite(Context *context, Transaction *txn, + ham_record_t *record, uint32_t flags); + + // Moves a Cursor (ham_cursor_move) + ham_status_t move(Context *context, ham_key_t *key, ham_record_t *record, + uint32_t flags); + + // Closes an existing cursor (ham_cursor_close) + void close(); + + // Updates (or builds) the dupecache for a cursor + // + // The |what| parameter specifies if the dupecache is initialized from + // btree (kBtree), from txn (kTxn) or both. + void update_dupecache(Context *context, uint32_t what); + + // Appends the duplicates of the BtreeCursor to the duplicate cache. + void append_btree_duplicates(Context *context, BtreeCursor *btc, + DupeCache *dc); + + // Clears the dupecache and disconnect the Cursor from any duplicate key + void clear_dupecache() { + m_dupecache.clear(); + set_dupecache_index(0); + } + + // Couples the cursor to a duplicate in the dupe table + // dupe_id is a 1 based index!! + void couple_to_dupe(uint32_t dupe_id); + + // Synchronizes txn- and btree-cursor + // + // If txn-cursor is nil then try to move the txn-cursor to the same key + // as the btree cursor. + // If btree-cursor is nil then try to move the btree-cursor to the same key + // as the txn cursor. + // If both are nil, or both are valid, then nothing happens + // + // |equal_key| is set to true if the keys in both cursors are equal. 
    // Sets the remote Cursor handle
    void set_remote_handle(uint64_t handle) {
      m_remote_handle = handle;
    }
+ const DupeCache *get_dupecache() const { + return (&m_dupecache); + } + + // Returns the current index in the dupe cache + uint32_t get_dupecache_index() const { + return (m_dupecache_index); + } + + // Sets the current index in the dupe cache + void set_dupecache_index(uint32_t index) { + m_dupecache_index = index; + } + + // Returns true if this cursor was never used before + // TODO this is identical to is_nil()?? + bool is_first_use() const { + return (m_is_first_use); + } + + // Stores the current operation; needed for ham_cursor_move + // TODO should be private + void set_lastop(uint32_t lastop) { + m_lastop = lastop; + m_is_first_use = false; + } + + private: + // Checks if a btree cursor points to a key that was overwritten or erased + // in the txn-cursor + // + // This is needed when moving the cursor backwards/forwards + // and consolidating the btree and the txn-tree + ham_status_t check_if_btree_key_is_erased_or_overwritten(Context *context); + + // Compares btree and txn-cursor; stores result in lastcmp + int compare(Context *context); + + // Returns true if this key has duplicates + bool has_duplicates() const { + return (m_dupecache.get_count() > 0); + } + + // Moves cursor to the first duplicate + ham_status_t move_first_dupe(Context *context); + + // Moves cursor to the last duplicate + ham_status_t move_last_dupe(Context *context); + + // Moves cursor to the next duplicate + ham_status_t move_next_dupe(Context *context); + + // Moves cursor to the previous duplicate + ham_status_t move_previous_dupe(Context *context); + + // Moves cursor to the first key + ham_status_t move_first_key(Context *context, uint32_t flags); + + // Moves cursor to the last key + ham_status_t move_last_key(Context *context, uint32_t flags); + + // Moves cursor to the next key + ham_status_t move_next_key(Context *context, uint32_t flags); + + // Moves cursor to the previous key + ham_status_t move_previous_key(Context *context, uint32_t flags); + + // Moves cursor to 
the first key - helper function + ham_status_t move_first_key_singlestep(Context *context); + + // Moves cursor to the last key - helper function + ham_status_t move_last_key_singlestep(Context *context); + + // Moves cursor to the next key - helper function + ham_status_t move_next_key_singlestep(Context *context); + + // Moves cursor to the previous key - helper function + ham_status_t move_previous_key_singlestep(Context *context); + + // Pointer to the Database object + LocalDatabase *m_db; + + // Pointer to the Transaction + Transaction *m_txn; + + // A Cursor which can walk over Transaction trees + TransactionCursor m_txn_cursor; + + // A Cursor which can walk over B+trees + BtreeCursor m_btree_cursor; + + // The remote database handle + uint64_t m_remote_handle; + + // Linked list of all Cursors in this Database + Cursor *m_next, *m_previous; + + // A cache for all duplicates of the current key. needed for + // ham_cursor_move, ham_find and other functions. The cache is + // used to consolidate all duplicates of btree and txn. + DupeCache m_dupecache; + + /** The current position of the cursor in the cache. This is a + * 1-based index. 0 means that the cache is not in use. */ + uint32_t m_dupecache_index; + + // The last operation (insert/find or move); needed for + // ham_cursor_move. Values can be HAM_CURSOR_NEXT, + // HAM_CURSOR_PREVIOUS or CURSOR_LOOKUP_INSERT + uint32_t m_lastop; + + // The result of the last compare operation + int m_last_cmp; + + // Cursor flags + uint32_t m_flags; + + // true if this cursor was never used + bool m_is_first_use; +}; + +} // namespace hamsterdb + +#endif /* HAM_CURSORS_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4db/db.cc b/plugins/Dbx_kv/src/hamsterdb/src/4db/db.cc new file mode 100644 index 0000000000..7d6cd82929 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4db/db.cc @@ -0,0 +1,143 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "4db/db.h" +#include "4cursor/cursor.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +Database::Database(Environment *env, DatabaseConfiguration &config) + : m_env(env), m_config(config), m_error(0), m_context(0), m_cursor_list(0) +{ +} + +ham_status_t +Database::cursor_create(Cursor **pcursor, Transaction *txn, uint32_t flags) +{ + try { + Cursor *cursor = cursor_create_impl(txn, flags); + + /* fix the linked list of cursors */ + cursor->set_next(m_cursor_list); + if (m_cursor_list) + m_cursor_list->set_previous(cursor); + m_cursor_list = cursor; + + if (txn) + txn->increase_cursor_refcount(); + + *pcursor = cursor; + return (0); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +Database::cursor_clone(Cursor **pdest, Cursor *src) +{ + try { + Cursor *dest = cursor_clone_impl(src); + + // fix the linked list of cursors + dest->set_previous(0); + dest->set_next(m_cursor_list); + ham_assert(m_cursor_list != 0); + m_cursor_list->set_previous(dest); + m_cursor_list = dest; + + // initialize the remaining fields + if (src->get_txn()) + src->get_txn()->increase_cursor_refcount(); + + *pdest = dest; + return (0); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +Database::cursor_close(Cursor *cursor) +{ + try { 
+ Cursor *p, *n; + + // first close the cursor + cursor_close_impl(cursor); + + // decrease the transaction refcount; the refcount specifies how many + // cursors are attached to the transaction + if (cursor->get_txn()) + cursor->get_txn()->decrease_cursor_refcount(); + + // fix the linked list of cursors + p = cursor->get_previous(); + n = cursor->get_next(); + + if (p) + p->set_next(n); + else + m_cursor_list = n; + + if (n) + n->set_previous(p); + + cursor->set_next(0); + cursor->set_previous(0); + + delete cursor; + return (0); + } + catch (Exception &ex) { + return (ex.code); + } +} + +// No need to catch Exceptions - they're caught in Environment::close_db +ham_status_t +Database::close(uint32_t flags) +{ + // auto-cleanup cursors? + if (flags & HAM_AUTO_CLEANUP) { + Cursor *cursor; + while ((cursor = m_cursor_list)) + cursor_close(cursor); + } + else if (m_cursor_list) { + ham_trace(("cannot close Database if Cursors are still open")); + return (set_error(HAM_CURSOR_STILL_OPEN)); + } + + // the derived classes can now do the bulk of the work + ham_status_t st = close_impl(flags); + if (st) + return (set_error(st)); + + m_env = 0; + return (0); +} + +} // namespace hamsterdb diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4db/db.h b/plugins/Dbx_kv/src/hamsterdb/src/4db/db.h new file mode 100644 index 0000000000..0290cc86b0 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4db/db.h @@ -0,0 +1,232 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/*
 * An abstract base class for a Database; it is overridden by the local and
 * remote implementations.
 *
 * The class stores the Environment pointer, a copy of the configuration,
 * the last error code, a user-provided context pointer and the head of the
 * linked list of all Cursors of this Database. Most operations are pure
 * virtual; creating, cloning and closing Cursors as well as closing the
 * Database are implemented here and delegate to the *_impl() functions.
 */
class Database
{
  public:
    // Constructor; stores |env| and a copy of |config|
    Database(Environment *env, DatabaseConfiguration &config);

    virtual ~Database() {
    }

    // Returns the Environment pointer
    Environment *get_env() {
      return (m_env);
    }

    // Returns the Database's configuration
    const DatabaseConfiguration &config() const {
      return (m_config);
    }

    // Returns the runtime-flags - the flags are "mixed" with the flags from
    // the Environment.
    // NOTE: this dereferences the Environment pointer, which is reset to 0
    // by a successful close()
    uint32_t get_flags() {
      return (m_env->get_flags() | m_config.flags);
    }

    // Returns the database name
    uint16_t name() const {
      return (m_config.db_name);
    }

    // Sets the database name
    void set_name(uint16_t name) {
      m_config.db_name = name;
    }

    // Fills in the current metrics
    virtual void fill_metrics(ham_env_metrics_t *metrics) = 0;

    // Returns Database parameters (ham_db_get_parameters)
    virtual ham_status_t get_parameters(ham_parameter_t *param) = 0;

    // Checks Database integrity (ham_db_check_integrity)
    virtual ham_status_t check_integrity(uint32_t flags) = 0;

    // Returns the number of keys (ham_db_get_key_count)
    virtual ham_status_t count(Transaction *txn, bool distinct,
                    uint64_t *pcount) = 0;

    // Scans the whole database, applies a processor function
    virtual ham_status_t scan(Transaction *txn, ScanVisitor *visitor,
                    bool distinct) = 0;

    // Inserts a key/value pair (ham_db_insert, ham_cursor_insert)
    virtual ham_status_t insert(Cursor *cursor, Transaction *txn,
                    ham_key_t *key, ham_record_t *record, uint32_t flags) = 0;

    // Erase a key/value pair (ham_db_erase, ham_cursor_erase)
    virtual ham_status_t erase(Cursor *cursor, Transaction *txn, ham_key_t *key,
                    uint32_t flags) = 0;

    // Lookup of a key/value pair (ham_db_find, ham_cursor_find)
    virtual ham_status_t find(Cursor *cursor, Transaction *txn, ham_key_t *key,
                    ham_record_t *record, uint32_t flags) = 0;

    // Creates a cursor (ham_cursor_create) and links it into the list of
    // cursors
    virtual ham_status_t cursor_create(Cursor **pcursor, Transaction *txn,
                    uint32_t flags);

    // Clones a cursor (ham_cursor_clone) and links the clone into the list
    // of cursors
    virtual ham_status_t cursor_clone(Cursor **pdest, Cursor *src);

    // Returns number of duplicates (ham_cursor_get_record_count)
    virtual ham_status_t cursor_get_record_count(Cursor *cursor,
                    uint32_t flags, uint32_t *pcount) = 0;

    // Returns position in duplicate list (ham_cursor_get_duplicate_position)
    virtual ham_status_t cursor_get_duplicate_position(Cursor *cursor,
                    uint32_t *pposition) = 0;

    // Get current record size (ham_cursor_get_record_size)
    virtual ham_status_t cursor_get_record_size(Cursor *cursor,
                    uint64_t *psize) = 0;

    // Overwrites the record of a cursor (ham_cursor_overwrite)
    virtual ham_status_t cursor_overwrite(Cursor *cursor,
                    ham_record_t *record, uint32_t flags) = 0;

    // Moves a cursor, returns key and/or record (ham_cursor_move)
    virtual ham_status_t cursor_move(Cursor *cursor, ham_key_t *key,
                    ham_record_t *record, uint32_t flags) = 0;

    // Closes a cursor (ham_cursor_close); not virtual, the derived classes
    // implement cursor_close_impl()
    ham_status_t cursor_close(Cursor *cursor);

    // Closes the Database (ham_db_close); not virtual, the derived classes
    // implement close_impl()
    ham_status_t close(uint32_t flags);

    // Returns the last error code
    ham_status_t get_error() const {
      return (m_error);
    }

    // Sets the last error code and returns it, which allows
    // "return (set_error(st));"
    ham_status_t set_error(ham_status_t e) {
      return ((m_error = e));
    }

    // Returns the user-provided context pointer (ham_get_context_data)
    void *get_context_data() {
      return (m_context);
    }

    // Sets the user-provided context pointer (ham_set_context_data)
    void set_context_data(void *ctxt) {
      m_context = ctxt;
    }

    // Returns the head of the linked list with all cursors
    Cursor *cursor_list() {
      return (m_cursor_list);
    }

    // Returns the memory buffer for the key data: the per-database buffer
    // if |txn| is null or temporary, otherwise the buffer from the |txn|
    ByteArray &key_arena(Transaction *txn) {
      return ((txn == 0 || (txn->get_flags() & HAM_TXN_TEMPORARY))
                 ? m_key_arena
                 : txn->key_arena());
    }

    // Returns the memory buffer for the record data: the per-database buffer
    // if |txn| is null or temporary, otherwise the buffer from the |txn|
    ByteArray &record_arena(Transaction *txn) {
      return ((txn == 0 || (txn->get_flags() & HAM_TXN_TEMPORARY))
                 ? m_record_arena
                 : txn->record_arena());
    }

  protected:
    // Creates a cursor; this is the actual implementation
    virtual Cursor *cursor_create_impl(Transaction *txn, uint32_t flags) = 0;

    // Clones a cursor; this is the actual implementation
    virtual Cursor *cursor_clone_impl(Cursor *src) = 0;

    // Closes a cursor; this is the actual implementation
    virtual void cursor_close_impl(Cursor *c) = 0;

    // Closes a database; this is the actual implementation
    virtual ham_status_t close_impl(uint32_t flags) = 0;

    // the current Environment; reset to 0 by a successful close()
    Environment *m_env;

    // the configuration settings (a copy of the constructor argument)
    DatabaseConfiguration m_config;

    // the last error code
    ham_status_t m_error;

    // the user-provided context data
    void *m_context;

    // linked list of all cursors; points to the most recently created one
    Cursor *m_cursor_list;

    // This is where key->data points to when returning a
    // key to the user; used by key_arena() if no Transaction or only a
    // temporary Transaction is given
    ByteArray m_key_arena;

    // This is where record->data points to when returning a
    // record to the user; used by record_arena() if no Transaction or only
    // a temporary Transaction is given
    ByteArray m_record_arena;
};
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "0root/root.h"

#include <boost/scope_exit.hpp>

// Always verify that a file of level N does not include headers > N!
#include "1mem/mem.h"
#include "1os/os.h"
#include "2page/page.h"
#include "2device/device.h"
#include "3page_manager/page_manager.h"
#include "3journal/journal.h"
#include "3blob_manager/blob_manager.h"
#include "3btree/btree_index.h"
#include "3btree/btree_index_factory.h"
#include "3btree/btree_cursor.h"
#include "3btree/btree_stats.h"
#include "4db/db_local.h"
#include "4context/context.h"
#include "4cursor/cursor.h"
#include "4txn/txn_local.h"
#include "4txn/txn_cursor.h"

#ifndef HAM_ROOT_H
#  error "root.h was not included"
#endif

namespace hamsterdb {

/*
 * Checks whether |key| can be inserted without conflicting with pending
 * or committed transactional operations, or with a key in the btree.
 *
 * Returns 0 if the insert may proceed, HAM_DUPLICATE_KEY if the key
 * exists and |flags| contains neither HAM_OVERWRITE nor HAM_DUPLICATE,
 * HAM_TXN_CONFLICT if another active transaction touched the key, or the
 * status of the btree lookup.
 */
ham_status_t
LocalDatabase::check_insert_conflicts(Context *context, TransactionNode *node,
                    ham_key_t *key, uint32_t flags)
{
  TransactionOperation *op = 0;

  /*
   * pick the tree_node of this key, and walk through each operation
   * in reverse chronological order (from newest to oldest):
   * - is this op part of an aborted txn? then skip it
   * - is this op part of a committed txn? then look at the
   *    operation in detail
   * - is this op part of a txn which is still active? return an error
   *    because we've found a conflict
   * - if a committed txn has erased the item then there's no need
   *    to continue checking older, committed txns
   */
  op = node->get_newest_op();
  while (op) {
    LocalTransaction *optxn = op->get_txn();
    if (optxn->is_aborted())
      ; /* nop */
    else if (optxn->is_committed() || context->txn == optxn) {
      /* operations which were already flushed are skipped here; the
       * btree lookup at the end of this function covers them */
      if (op->get_flags() & TransactionOperation::kIsFlushed)
        ; /* nop */
      /* if key was erased then it doesn't exist and can be
       * inserted without problems */
      else if (op->get_flags() & TransactionOperation::kErase)
        return (0);
      /* if the key already exists then we can only continue if
       * we're allowed to overwrite it or to insert a duplicate */
      else if ((op->get_flags() & TransactionOperation::kInsert)
          || (op->get_flags() & TransactionOperation::kInsertOverwrite)
          || (op->get_flags() & TransactionOperation::kInsertDuplicate)) {
        if ((flags & HAM_OVERWRITE) || (flags & HAM_DUPLICATE))
          return (0);
        else
          return (HAM_DUPLICATE_KEY);
      }
      else if (!(op->get_flags() & TransactionOperation::kNop)) {
        ham_assert(!"shouldn't be here");
        return (HAM_DUPLICATE_KEY);
      }
    }
    else { /* txn is still active */
      return (HAM_TXN_CONFLICT);
    }

    op = op->get_previous_in_node();
  }

  /*
   * we've successfully checked all un-flushed transactions and there
   * were no conflicts. Now check all transactions which are already
   * flushed - basically that's identical to a btree lookup.
   *
   * however we can skip this check if we do not care about duplicates
   * (overwrite/duplicate inserts and record number databases).
   */
  if ((flags & HAM_OVERWRITE)
        || (flags & HAM_DUPLICATE)
        || (get_flags() & (HAM_RECORD_NUMBER32 | HAM_RECORD_NUMBER64)))
    return (0);

  ham_status_t st = m_btree_index->find(context, 0, key, 0, 0, 0, flags);
  switch (st) {
    /* key is not in the btree: the insert is fine */
    case HAM_KEY_NOT_FOUND:
      return (0);
    /* key is in the btree: the insert would create a duplicate */
    case HAM_SUCCESS:
      return (HAM_DUPLICATE_KEY);
    default:
      return (st);
  }
}

/*
 * Checks whether |key| can be erased without conflicting with pending
 * or committed transactional operations.
 *
 * Returns 0 if the key exists, HAM_KEY_NOT_FOUND if it was already
 * erased (or is not in the btree), HAM_TXN_CONFLICT if another active
 * transaction touched the key, or the status of the btree lookup.
 */
ham_status_t
LocalDatabase::check_erase_conflicts(Context *context, TransactionNode *node,
                    ham_key_t *key, uint32_t flags)
{
  TransactionOperation *op = 0;

  /*
   * pick the tree_node of this key, and walk through each operation
   * in reverse chronological order (from newest to oldest):
   * - is this op part of an aborted txn? then skip it
   * - is this op part of a committed txn? then look at the
   *    operation in detail
   * - is this op part of a txn which is still active? return an error
   *    because we've found a conflict
   * - if a committed txn has erased the item then there's no need
   *    to continue checking older, committed txns
   */
  op = node->get_newest_op();
  while (op) {
    Transaction *optxn = op->get_txn();
    if (optxn->is_aborted())
      ; /* nop */
    else if (optxn->is_committed() || context->txn == optxn) {
      if (op->get_flags() & TransactionOperation::kIsFlushed)
        ; /* nop */
      /* if key was erased then it doesn't exist and we fail with
       * an error */
      else if (op->get_flags() & TransactionOperation::kErase)
        return (HAM_KEY_NOT_FOUND);
      /* if the key exists then we're successful */
      else if ((op->get_flags() & TransactionOperation::kInsert)
          || (op->get_flags() & TransactionOperation::kInsertOverwrite)
          || (op->get_flags() & TransactionOperation::kInsertDuplicate)) {
        return (0);
      }
      else if (!(op->get_flags() & TransactionOperation::kNop)) {
        ham_assert(!"shouldn't be here");
        return (HAM_KEY_NOT_FOUND);
      }
    }
    else { /* txn is still active */
      return (HAM_TXN_CONFLICT);
    }

    op = op->get_previous_in_node();
  }

  /*
   * we've successfully checked all un-flushed transactions and there
   * were no conflicts. Now check all transactions which are already
   * flushed - basically that's identical to a btree lookup.
   */
  return (m_btree_index->find(context, 0, key, 0, 0, 0, flags));
}

/*
 * Inserts a key/record pair as a new operation in the transaction index.
 * A TransactionNode for |key| is created on demand and removed again if
 * the conflict check fails.
 */
ham_status_t
LocalDatabase::insert_txn(Context *context, ham_key_t *key,
                ham_record_t *record, uint32_t flags, TransactionCursor *cursor)
{
  ham_status_t st = 0;
  TransactionOperation *op;
  bool node_created = false;

  /* get (or create) the node for this key */
  TransactionNode *node = m_txn_index->get(key, 0);
  if (!node) {
    node = new TransactionNode(this, key);
    node_created = true;
    // TODO only store when the operation is successful?
    m_txn_index->store(node);
  }

  // check for conflicts of this key
  //
  // !!
  // afterwards, clear the changeset; check_insert_conflicts()
  // checks if a key already exists, and this fills the changeset
  st = check_insert_conflicts(context, node, key, flags);
  if (st) {
    // undo the node creation from above, otherwise an empty node would
    // remain in the index
    if (node_created) {
      m_txn_index->remove(node);
      delete node;
    }
    return (st);
  }

  // append a new operation to this node
  op = node->append(context->txn, flags,
                (flags & HAM_PARTIAL) |
                ((flags & HAM_DUPLICATE)
                    ? TransactionOperation::kInsertDuplicate
                    : (flags & HAM_OVERWRITE)
                        ?
TransactionOperation::kInsertOverwrite + : TransactionOperation::kInsert), + lenv()->next_lsn(), key, record); + + // if there's a cursor then couple it to the op; also store the + // dupecache-index in the op (it's needed for DUPLICATE_INSERT_BEFORE/NEXT) */ + if (cursor) { + Cursor *c = cursor->get_parent(); + if (c->get_dupecache_index()) + op->set_referenced_dupe(c->get_dupecache_index()); + + cursor->couple_to_op(op); + + // all other cursors need to increment their dupe index, if their + // index is > this cursor's index + increment_dupe_index(context, node, c, c->get_dupecache_index()); + } + + // append journal entry + if (m_env->get_flags() & HAM_ENABLE_RECOVERY + && m_env->get_flags() & HAM_ENABLE_TRANSACTIONS) { + Journal *j = lenv()->journal(); + j->append_insert(this, context->txn, key, record, + flags & HAM_DUPLICATE ? flags : flags | HAM_OVERWRITE, + op->get_lsn()); + } + + ham_assert(st == 0); + return (0); +} + +ham_status_t +LocalDatabase::find_txn(Context *context, Cursor *cursor, + ham_key_t *key, ham_record_t *record, uint32_t flags) +{ + ham_status_t st = 0; + TransactionOperation *op = 0; + bool first_loop = true; + bool exact_is_erased = false; + + ByteArray *pkey_arena = &key_arena(context->txn); + ByteArray *precord_arena = &record_arena(context->txn); + + ham_key_set_intflags(key, + (ham_key_get_intflags(key) & (~BtreeKey::kApproximate))); + + /* get the node for this key (but don't create a new one if it does + * not yet exist) */ + TransactionNode *node = m_txn_index->get(key, flags); + + /* + * pick the node of this key, and walk through each operation + * in reverse chronological order (from newest to oldest): + * - is this op part of an aborted txn? then skip it + * - is this op part of a committed txn? then look at the + * operation in detail + * - is this op part of an txn which is still active? 
return an error + * because we've found a conflict + * - if a committed txn has erased the item then there's no need + * to continue checking older, committed txns + */ +retry: + if (node) + op = node->get_newest_op(); + while (op) { + Transaction *optxn = op->get_txn(); + if (optxn->is_aborted()) + ; /* nop */ + else if (optxn->is_committed() || context->txn == optxn) { + if (op->get_flags() & TransactionOperation::kIsFlushed) + ; /* nop */ + /* if key was erased then it doesn't exist and we can return + * immediately + * + * if an approximate match is requested then move to the next + * or previous node + */ + else if (op->get_flags() & TransactionOperation::kErase) { + if (first_loop + && !(ham_key_get_intflags(key) & BtreeKey::kApproximate)) + exact_is_erased = true; + first_loop = false; + if (flags & HAM_FIND_LT_MATCH) { + node = node->get_previous_sibling(); + if (!node) + break; + ham_key_set_intflags(key, + (ham_key_get_intflags(key) | BtreeKey::kApproximate)); + goto retry; + } + else if (flags & HAM_FIND_GT_MATCH) { + node = node->get_next_sibling(); + if (!node) + break; + ham_key_set_intflags(key, + (ham_key_get_intflags(key) | BtreeKey::kApproximate)); + goto retry; + } + /* if a duplicate was deleted then check if there are other duplicates + * left */ + st = HAM_KEY_NOT_FOUND; + // TODO merge both calls + if (cursor) { + cursor->get_txn_cursor()->couple_to_op(op); + cursor->couple_to_txnop(); + } + if (op->get_referenced_dupe() > 1) { + // not the first dupe - there are other dupes + st = 0; + } + else if (op->get_referenced_dupe() == 1) { + // check if there are other dupes + bool is_equal; + (void)cursor->sync(context, Cursor::kSyncOnlyEqualKeys, &is_equal); + if (!is_equal) // TODO merge w/ line above? + cursor->set_to_nil(Cursor::kBtree); + st = cursor->get_dupecache_count(context) ? 
0 : HAM_KEY_NOT_FOUND; + } + return (st); + } + /* if the key already exists then return its record; do not + * return pointers to TransactionOperation::get_record, because it may be + * flushed and the user's pointers would be invalid */ + else if ((op->get_flags() & TransactionOperation::kInsert) + || (op->get_flags() & TransactionOperation::kInsertOverwrite) + || (op->get_flags() & TransactionOperation::kInsertDuplicate)) { + if (cursor) { // TODO merge those calls + cursor->get_txn_cursor()->couple_to_op(op); + cursor->couple_to_txnop(); + } + // approx match? leave the loop and continue + // with the btree + if (ham_key_get_intflags(key) & BtreeKey::kApproximate) + break; + // otherwise copy the record and return + if (record) + return (LocalDatabase::copy_record(this, context->txn, op, record)); + return (0); + } + else if (!(op->get_flags() & TransactionOperation::kNop)) { + ham_assert(!"shouldn't be here"); + return (HAM_KEY_NOT_FOUND); + } + } + else { /* txn is still active */ + return (HAM_TXN_CONFLICT); + } + + op = op->get_previous_in_node(); + } + + /* + * if there was an approximate match: check if the btree provides + * a better match + * + * TODO use alloca or ByteArray instead of Memory::allocate() + */ + if (op && ham_key_get_intflags(key) & BtreeKey::kApproximate) { + ham_key_t txnkey = {0}; + ham_key_t *k = op->get_node()->get_key(); + txnkey.size = k->size; + txnkey._flags = BtreeKey::kApproximate; + txnkey.data = Memory::allocate<uint8_t>(txnkey.size); + memcpy(txnkey.data, k->data, txnkey.size); + + ham_key_set_intflags(key, 0); + + // the "exact match" key was erased? 
then don't fetch it again + if (exact_is_erased) + flags = flags & (~HAM_FIND_EXACT_MATCH); + + // now lookup in the btree + if (cursor) + cursor->set_to_nil(Cursor::kBtree); + st = m_btree_index->find(context, cursor, key, pkey_arena, record, + precord_arena, flags); + if (st == HAM_KEY_NOT_FOUND) { + if (!(key->flags & HAM_KEY_USER_ALLOC) && txnkey.data) { + pkey_arena->resize(txnkey.size); + key->data = pkey_arena->get_ptr(); + } + if (txnkey.data) { + ::memcpy(key->data, txnkey.data, txnkey.size); + Memory::release(txnkey.data); + } + key->size = txnkey.size; + key->_flags = txnkey._flags; + + if (cursor) { // TODO merge those calls + cursor->get_txn_cursor()->couple_to_op(op); + cursor->couple_to_txnop(); + } + if (record) + return (LocalDatabase::copy_record(this, context->txn, op, record)); + return (0); + } + else if (st) + return (st); + // the btree key is a direct match? then return it + if ((!(ham_key_get_intflags(key) & BtreeKey::kApproximate)) + && (flags & HAM_FIND_EXACT_MATCH)) { + Memory::release(txnkey.data); + if (cursor) + cursor->couple_to_btree(); + return (0); + } + // if there's an approx match in the btree: compare both keys and + // use the one that is closer. if the btree is closer: make sure + // that it was not erased or overwritten in a transaction + int cmp = m_btree_index->compare_keys(key, &txnkey); + bool use_btree = false; + if (flags & HAM_FIND_GT_MATCH) { + if (cmp < 0) + use_btree = true; + } + else if (flags & HAM_FIND_LT_MATCH) { + if (cmp > 0) + use_btree = true; + } + else + ham_assert(!"shouldn't be here"); + + if (use_btree) { + Memory::release(txnkey.data); + // lookup again, with the same flags and the btree key. 
+ // this will check if the key was erased or overwritten + // in a transaction + st = find_txn(context, cursor, key, record, flags | HAM_FIND_EXACT_MATCH); + if (st == 0) + ham_key_set_intflags(key, + (ham_key_get_intflags(key) | BtreeKey::kApproximate)); + return (st); + } + else { // use txn + if (!(key->flags & HAM_KEY_USER_ALLOC) && txnkey.data) { + pkey_arena->resize(txnkey.size); + key->data = pkey_arena->get_ptr(); + } + if (txnkey.data) { + ::memcpy(key->data, txnkey.data, txnkey.size); + Memory::release(txnkey.data); + } + key->size = txnkey.size; + key->_flags = txnkey._flags; + + if (cursor) { // TODO merge those calls + cursor->get_txn_cursor()->couple_to_op(op); + cursor->couple_to_txnop(); + } + if (record) + return (LocalDatabase::copy_record(this, context->txn, op, record)); + return (0); + } + } + + /* + * no approximate match: + * + * we've successfully checked all un-flushed transactions and there + * were no conflicts, and we have not found the key: now try to + * lookup the key in the btree. + */ + return (m_btree_index->find(context, cursor, key, pkey_arena, record, + precord_arena, flags)); +} + +ham_status_t +LocalDatabase::erase_txn(Context *context, ham_key_t *key, uint32_t flags, + TransactionCursor *cursor) +{ + ham_status_t st = 0; + TransactionOperation *op; + bool node_created = false; + Cursor *pc = 0; + if (cursor) + pc = cursor->get_parent(); + + /* get (or create) the node for this key */ + TransactionNode *node = m_txn_index->get(key, 0); + if (!node) { + node = new TransactionNode(this, key); + node_created = true; + // TODO only store when the operation is successful? + m_txn_index->store(node); + } + + /* check for conflicts of this key - but only if we're not erasing a + * duplicate key. 
   * dupes are checked for conflicts in _local_cursor_move (TODO: that
   * function no longer exists) */
  if (!pc || (!pc->get_dupecache_index())) {
    st = check_erase_conflicts(context, node, key, flags);
    if (st) {
      /* undo the node creation from above */
      if (node_created) {
        m_txn_index->remove(node);
        delete node;
      }
      return (st);
    }
  }

  /* append a new operation to this node */
  op = node->append(context->txn, flags, TransactionOperation::kErase,
                  lenv()->next_lsn(), key, 0);

  /* is this function called through ham_cursor_erase? then add the
   * duplicate ID */
  if (cursor) {
    if (pc->get_dupecache_index())
      op->set_referenced_dupe(pc->get_dupecache_index());
  }

  /* the current op has no cursors attached; but if there are any
   * other ops in this node and in this transaction, then they have to
   * be set to nil. This only nil's txn-cursors! */
  nil_all_cursors_in_node(context->txn, pc, node);

  /* in addition we nil all btree cursors which are coupled to this key */
  nil_all_cursors_in_btree(context, pc, node->get_key());

  /* append journal entry */
  if (m_env->get_flags() & HAM_ENABLE_RECOVERY
      && m_env->get_flags() & HAM_ENABLE_TRANSACTIONS) {
    Journal *j = lenv()->journal();
    j->append_erase(this, context->txn, key, 0,
                    flags | HAM_ERASE_ALL_DUPLICATES, op->get_lsn());
  }

  ham_assert(st == 0);
  return (0);
}

/*
 * Creates a new Database: derives the key size from the key type,
 * validates it against the page size, decides whether records are stored
 * inline, then creates the btree and the transaction index.
 *
 * Returns HAM_INV_KEY_SIZE if fewer than 10 keys would fit into a page,
 * otherwise 0.
 */
ham_status_t
LocalDatabase::create(Context *context, PBtreeHeader *btree_header)
{
  /* set the flags; strip off run-time (per session) flags for the btree */
  uint32_t persistent_flags = get_flags();
  persistent_flags &= ~(HAM_CACHE_UNLIMITED
            | HAM_DISABLE_MMAP
            | HAM_ENABLE_FSYNC
            | HAM_READ_ONLY
            | HAM_ENABLE_RECOVERY
            | HAM_AUTO_RECOVERY
            | HAM_ENABLE_TRANSACTIONS);

  /* numeric key types have a fixed key size; all other key types keep
   * the configured size */
  switch (m_config.key_type) {
    case HAM_TYPE_UINT8:
      m_config.key_size = 1;
      break;
    case HAM_TYPE_UINT16:
      m_config.key_size = 2;
      break;
    case HAM_TYPE_REAL32:
    case HAM_TYPE_UINT32:
      m_config.key_size = 4;
      break;
    case HAM_TYPE_REAL64:
    case HAM_TYPE_UINT64:
      m_config.key_size = 8;
      break;
  }

  // if we cannot fit at least 10 keys in a page then refuse to continue
  if (m_config.key_size != HAM_KEY_SIZE_UNLIMITED) {
    if (lenv()->config().page_size_bytes / (m_config.key_size + 8) < 10) {
      ham_trace(("key size too large; either increase page_size or decrease "
                "key size"));
      return (HAM_INV_KEY_SIZE);
    }
  }

  // fixed length records:
  //
  // if records are <= 8 bytes OR if we can fit at least 500 keys AND
  // records into the leaf then store the records in the leaf;
  // otherwise they're allocated as a blob
  if (m_config.record_size != HAM_RECORD_SIZE_UNLIMITED) {
    if (m_config.record_size <= 8
        || (m_config.record_size <= kInlineRecordThreshold
          && lenv()->config().page_size_bytes
                / (m_config.key_size + m_config.record_size) > 500)) {
      persistent_flags |= HAM_FORCE_RECORDS_INLINE;
      m_config.flags |= HAM_FORCE_RECORDS_INLINE;
    }
  }

  // create the btree
  m_btree_index.reset(new BtreeIndex(this, btree_header, persistent_flags,
                        m_config.key_type, m_config.key_size));

  /* initialize the btree */
  m_btree_index->create(context, m_config.key_type, m_config.key_size,
                  m_config.record_size);

  /* the header page is now dirty */
  Page *header = lenv()->page_manager()->fetch(context, 0);
  header->set_dirty(true);

  /* and the TransactionIndex */
  m_txn_index.reset(new TransactionIndex(this));

  return (0);
}

/*
 * Opens an existing Database: restores the configuration from the btree
 * header, opens the btree, creates the transaction index and - for record
 * number databases - loads the last used record number.
 */
ham_status_t
LocalDatabase::open(Context *context, PBtreeHeader *btree_header)
{
  /*
   * set the database flags; strip off the run-time (per session) flags
   * that may have been set by the caller, before mixing in the persistent
   * flags as obtained from the btree.
   */
  uint32_t flags = get_flags();
  flags &= ~(HAM_CACHE_UNLIMITED
            | HAM_DISABLE_MMAP
            | HAM_ENABLE_FSYNC
            | HAM_READ_ONLY
            | HAM_ENABLE_RECOVERY
            | HAM_AUTO_RECOVERY
            | HAM_ENABLE_TRANSACTIONS);

  m_config.key_type = btree_header->get_key_type();
  m_config.key_size = btree_header->get_key_size();

  /* create the BtreeIndex */
  m_btree_index.reset(new BtreeIndex(this, btree_header,
                            flags | btree_header->get_flags(),
                            btree_header->get_key_type(),
                            btree_header->get_key_size()));

  /* none of the run-time flags may end up in the btree */
  ham_assert(!(m_btree_index->get_flags() & HAM_CACHE_UNLIMITED));
  ham_assert(!(m_btree_index->get_flags() & HAM_DISABLE_MMAP));
  ham_assert(!(m_btree_index->get_flags() & HAM_ENABLE_FSYNC));
  ham_assert(!(m_btree_index->get_flags() & HAM_READ_ONLY));
  ham_assert(!(m_btree_index->get_flags() & HAM_ENABLE_RECOVERY));
  ham_assert(!(m_btree_index->get_flags() & HAM_AUTO_RECOVERY));
  ham_assert(!(m_btree_index->get_flags() & HAM_ENABLE_TRANSACTIONS));

  /* initialize the btree */
  m_btree_index->open();

  /* create the TransactionIndex - TODO only if txn's are enabled? */
  m_txn_index.reset(new TransactionIndex(this));

  /* merge the non-persistent database flag with the persistent flags from
   * the btree index */
  m_config.flags = config().flags | m_btree_index->get_flags();
  m_config.key_size = m_btree_index->get_key_size();
  m_config.key_type = m_btree_index->get_key_type();
  m_config.record_size = m_btree_index->get_record_size();

  // fetch the current record number: it is the key of the last item
  if ((get_flags() & (HAM_RECORD_NUMBER32 | HAM_RECORD_NUMBER64))) {
    ham_key_t key = {};
    Cursor *c = new Cursor(this, 0, 0);
    ham_status_t st = cursor_move_impl(context, c, &key, 0, HAM_CURSOR_LAST);
    cursor_close(c);
    // an empty database is not an error; the record number is not loaded
    if (st)
      return (st == HAM_KEY_NOT_FOUND ? 0 : st);

    if (get_flags() & HAM_RECORD_NUMBER32)
      m_recno = *(uint32_t *)key.data;
    else
      m_recno = *(uint64_t *)key.data;
  }

  return (0);
}

// A btree visitor which collects the metrics of every visited node in a
// ham_env_metrics_t structure; leaf nodes and internal nodes are tracked
// separately.
struct MetricsVisitor : public BtreeVisitor {
  MetricsVisitor(ham_env_metrics_t *metrics)
    : m_metrics(metrics) {
  }

  // Specifies if the visitor modifies the node
  virtual bool is_read_only() const {
    return (true);
  }

  // called for each node
  virtual void operator()(Context *context, BtreeNodeProxy *node) {
    if (node->is_leaf())
      node->fill_metrics(&m_metrics->btree_leaf_metrics);
    else
      node->fill_metrics(&m_metrics->btree_internal_metrics);
  }

  // the structure which receives the metrics
  ham_env_metrics_t *m_metrics;
};

// Fills |metrics| with the btree metrics of this Database. Exceptions are
// swallowed; in that case |metrics| may be only partially filled.
void
LocalDatabase::fill_metrics(ham_env_metrics_t *metrics)
{
  metrics->btree_leaf_metrics.database_name = name();
  metrics->btree_internal_metrics.database_name = name();

  try {
    MetricsVisitor visitor(metrics);
    Context context(lenv(), 0, this);
    m_btree_index->visit_nodes(&context, visitor, true);

    // calculate the "avg" values
    BtreeStatistics::finalize_metrics(&metrics->btree_leaf_metrics);
    BtreeStatistics::finalize_metrics(&metrics->btree_internal_metrics);
  }
  catch (Exception &) {
  }
}

// Fills in the values of all requested parameters (ham_db_get_parameters).
// |param| is an array which is terminated by an entry with name 0; it may
// be null. Returns HAM_INV_PARAMETER for unknown parameter names.
ham_status_t
LocalDatabase::get_parameters(ham_parameter_t *param)
{
  try {
    Context context(lenv(), 0, this);

    Page *page = 0;
    ham_parameter_t *p = param;

    if (p) {
      for (; p->name; p++) {
        switch (p->name) {
        case HAM_PARAM_KEY_SIZE:
          p->value = m_config.key_size;
          break;
        case HAM_PARAM_KEY_TYPE:
          p->value = m_config.key_type;
          break;
        case HAM_PARAM_RECORD_SIZE:
          p->value = m_config.record_size;
          break;
        case HAM_PARAM_FLAGS:
          p->value = (uint64_t)get_flags();
          break;
        case HAM_PARAM_DATABASE_NAME:
          p->value = (uint64_t)name();
          break;
        case HAM_PARAM_MAX_KEYS_PER_PAGE:
          // estimated from the root node; remains 0 if the root page
          // cannot be fetched
          p->value = 0;
          page = lenv()->page_manager()->fetch(&context,
                          m_btree_index->get_root_address(),
                          PageManager::kReadOnly);
          if (page) {
            BtreeNodeProxy *node =
m_btree_index->get_node_from_page(page); + p->value = node->estimate_capacity(); + } + break; + case HAM_PARAM_RECORD_COMPRESSION: + p->value = 0; + break; + case HAM_PARAM_KEY_COMPRESSION: + p->value = 0; + break; + default: + ham_trace(("unknown parameter %d", (int)p->name)); + throw Exception(HAM_INV_PARAMETER); + } + } + } + } + catch (Exception &ex) { + return (ex.code); + } + return (0); +} + +ham_status_t +LocalDatabase::check_integrity(uint32_t flags) +{ + try { + Context context(lenv(), 0, this); + + /* purge cache if necessary */ + lenv()->page_manager()->purge_cache(&context); + + /* call the btree function */ + m_btree_index->check_integrity(&context, flags); + + /* call the txn function */ + //m_txn_index->check_integrity(flags); + } + catch (Exception &ex) { + return (ex.code); + } + return (0); +} + +ham_status_t +LocalDatabase::count(Transaction *htxn, bool distinct, uint64_t *pcount) +{ + LocalTransaction *txn = dynamic_cast<LocalTransaction *>(htxn); + + try { + Context context(lenv(), txn, this); + + /* purge cache if necessary */ + lenv()->page_manager()->purge_cache(&context); + + /* + * call the btree function - this will retrieve the number of keys + * in the btree + */ + uint64_t keycount = m_btree_index->count(&context, distinct); + + /* + * if transactions are enabled, then also sum up the number of keys + * from the transaction tree + */ + if (get_flags() & HAM_ENABLE_TRANSACTIONS) + keycount += m_txn_index->count(&context, txn, distinct); + + *pcount = keycount; + return (0); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +LocalDatabase::scan(Transaction *txn, ScanVisitor *visitor, bool distinct) +{ + ham_status_t st = 0; + + try { + Context context(lenv(), (LocalTransaction *)txn, this); + + Page *page; + ham_key_t key = {0}; + + /* purge cache if necessary */ + lenv()->page_manager()->purge_cache(&context); + + /* create a cursor, move it to the first key */ + Cursor *cursor = cursor_create_impl(txn, 0); + + 
st = cursor_move_impl(&context, cursor, &key, 0, HAM_CURSOR_FIRST); + if (st) + goto bail; + + /* only transaction keys? then use a regular cursor */ + if (!cursor->is_coupled_to_btree()) { + do { + /* process the key */ + (*visitor)(key.data, key.size, distinct + ? cursor->get_record_count(&context, 0) + : 1); + } while ((st = cursor_move_impl(&context, cursor, &key, + 0, HAM_CURSOR_NEXT)) == 0); + goto bail; + } + + /* only btree keys? then traverse page by page */ + if (!(get_flags() & HAM_ENABLE_TRANSACTIONS)) { + ham_assert(cursor->is_coupled_to_btree()); + + do { + // get the coupled page + cursor->get_btree_cursor()->get_coupled_key(&page); + BtreeNodeProxy *node = m_btree_index->get_node_from_page(page); + // and let the btree node perform the remaining work + node->scan(&context, visitor, 0, distinct); + } while (cursor->get_btree_cursor()->move_to_next_page(&context) == 0); + + goto bail; + } + + /* mixed txn/btree load? if there are btree nodes which are NOT modified + * in transactions then move the scan to the btree node. Otherwise use + * a regular cursor */ + while (true) { + if (!cursor->is_coupled_to_btree()) + break; + + int slot; + cursor->get_btree_cursor()->get_coupled_key(&page, &slot); + BtreeNodeProxy *node = m_btree_index->get_node_from_page(page); + + /* are transactions present? then check if the next txn key is >= btree[0] + * and <= btree[n] */ + ham_key_t *txnkey = 0; + if (cursor->get_txn_cursor()->get_coupled_op()) + txnkey = cursor->get_txn_cursor()->get_coupled_op()->get_node()->get_key(); + // no (more) transactional keys left - process the current key, then + // scan the remaining keys directly in the btree + if (!txnkey) { + /* process the key */ + (*visitor)(key.data, key.size, distinct + ? 
cursor->get_record_count(&context, 0) + : 1); + break; + } + + /* if yes: use the cursor to traverse the page */ + if (node->compare(&context, txnkey, 0) >= 0 + && node->compare(&context, txnkey, node->get_count() - 1) <= 0) { + do { + Page *new_page = 0; + if (cursor->is_coupled_to_btree()) + cursor->get_btree_cursor()->get_coupled_key(&new_page); + /* break the loop if we've reached the next page */ + if (new_page && new_page != page) { + page = new_page; + break; + } + /* process the key */ + (*visitor)(key.data, key.size, distinct + ? cursor->get_record_count(&context, 0) + : 1); + } while ((st = cursor_move_impl(&context, cursor, &key, + 0, HAM_CURSOR_NEXT)) == 0); + + if (st != HAM_SUCCESS) + goto bail; + } + else { + /* Otherwise traverse directly in the btree page. This is the fastest + * code path. */ + node->scan(&context, visitor, slot, distinct); + /* and then move to the next page */ + if (cursor->get_btree_cursor()->move_to_next_page(&context) != 0) + break; + } + } + + /* pick up the remaining transactional keys */ + while ((st = cursor_move_impl(&context, cursor, &key, + 0, HAM_CURSOR_NEXT)) == 0) { + (*visitor)(key.data, key.size, distinct + ? cursor->get_record_count(&context, 0) + : 1); + } + +bail: + if (cursor) + cursor_close_impl(cursor); + return (st == HAM_KEY_NOT_FOUND ? 0 : st); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +LocalDatabase::insert(Cursor *cursor, Transaction *txn, ham_key_t *key, + ham_record_t *record, uint32_t flags) +{ + Context context(lenv(), (LocalTransaction *)txn, this); + + try { + if (m_config.flags & (HAM_RECORD_NUMBER32 | HAM_RECORD_NUMBER64)) { + if (key->size == 0 && key->data == 0) { + // ok! 
+ } + else if (key->size == 0 && key->data != 0) { + ham_trace(("for record number keys set key size to 0, " + "key->data to null")); + return (HAM_INV_PARAMETER); + } + else if (key->size != m_config.key_size) { + ham_trace(("invalid key size (%u instead of %u)", + key->size, m_config.key_size)); + return (HAM_INV_KEY_SIZE); + } + } + else if (m_config.key_size != HAM_KEY_SIZE_UNLIMITED + && key->size != m_config.key_size) { + ham_trace(("invalid key size (%u instead of %u)", + key->size, m_config.key_size)); + return (HAM_INV_KEY_SIZE); + } + if (m_config.record_size != HAM_RECORD_SIZE_UNLIMITED + && record->size != m_config.record_size) { + ham_trace(("invalid record size (%u instead of %u)", + record->size, m_config.record_size)); + return (HAM_INV_RECORD_SIZE); + } + + ByteArray *arena = &key_arena(txn); + + /* + * record number: make sure that we have a valid key structure, + * and lazy load the last used record number + * + * TODO TODO + * too much duplicated code + */ + uint64_t recno = 0; + if (get_flags() & HAM_RECORD_NUMBER64) { + if (flags & HAM_OVERWRITE) { + ham_assert(key->size == sizeof(uint64_t)); + ham_assert(key->data != 0); + recno = *(uint64_t *)key->data; + } + else { + /* get the record number and increment it */ + recno = next_record_number(); + } + + /* allocate memory for the key */ + if (!key->data) { + arena->resize(sizeof(uint64_t)); + key->data = arena->get_ptr(); + } + key->size = sizeof(uint64_t); + *(uint64_t *)key->data = recno; + + /* A recno key is always appended sequentially */ + flags |= HAM_HINT_APPEND; + } + else if (get_flags() & HAM_RECORD_NUMBER32) { + if (flags & HAM_OVERWRITE) { + ham_assert(key->size == sizeof(uint32_t)); + ham_assert(key->data != 0); + recno = *(uint32_t *)key->data; + } + else { + /* get the record number and increment it */ + recno = next_record_number(); + } + + /* allocate memory for the key */ + if (!key->data) { + arena->resize(sizeof(uint32_t)); + key->data = arena->get_ptr(); + } + key->size = 
sizeof(uint32_t); + *(uint32_t *)key->data = (uint32_t)recno; + + /* A recno key is always appended sequentially */ + flags |= HAM_HINT_APPEND; + } + + ham_status_t st = 0; + LocalTransaction *local_txn = 0; + + /* purge cache if necessary */ + if (!txn && (get_flags() & HAM_ENABLE_TRANSACTIONS)) { + local_txn = begin_temp_txn(); + context.txn = local_txn; + } + + st = insert_impl(&context, cursor, key, record, flags); + return (finalize(&context, st, local_txn)); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +LocalDatabase::erase(Cursor *cursor, Transaction *txn, ham_key_t *key, + uint32_t flags) +{ + Context context(lenv(), (LocalTransaction *)txn, this); + + try { + ham_status_t st = 0; + LocalTransaction *local_txn = 0; + + if (cursor) { + if (cursor->is_nil()) + throw Exception(HAM_CURSOR_IS_NIL); + if (cursor->is_coupled_to_txnop()) // TODO rewrite the next line, it's ugly + key = cursor->get_txn_cursor()->get_coupled_op()->get_node()->get_key(); + else // cursor->is_coupled_to_btree() + key = 0; + } + + if (key) { + if (m_config.key_size != HAM_KEY_SIZE_UNLIMITED + && key->size != m_config.key_size) { + ham_trace(("invalid key size (%u instead of %u)", + key->size, m_config.key_size)); + return (HAM_INV_KEY_SIZE); + } + } + + if (!txn && (get_flags() & HAM_ENABLE_TRANSACTIONS)) { + local_txn = begin_temp_txn(); + context.txn = local_txn; + } + + st = erase_impl(&context, cursor, key, flags); + return (finalize(&context, st, local_txn)); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +LocalDatabase::find(Cursor *cursor, Transaction *txn, ham_key_t *key, + ham_record_t *record, uint32_t flags) +{ + Context context(lenv(), (LocalTransaction *)txn, this); + + try { + ham_status_t st = 0; + + /* Duplicates AND Transactions require a Cursor because only + * Cursors can build lists of duplicates. 
+ * TODO not exception safe - if find() throws then the cursor is not closed + */ + if (!cursor + && (get_flags() & (HAM_ENABLE_DUPLICATE_KEYS|HAM_ENABLE_TRANSACTIONS))) { + Cursor *c = cursor_create_impl(txn, 0); + st = find(c, txn, key, record, flags); + cursor_close_impl(c); + delete c; + return (st); + } + + if (m_config.key_size != HAM_KEY_SIZE_UNLIMITED + && key->size != m_config.key_size) { + ham_trace(("invalid key size (%u instead of %u)", + key->size, m_config.key_size)); + return (HAM_INV_KEY_SIZE); + } + + // cursor: reset the dupecache, set to nil + // TODO merge both calls, only set to nil if find() was successful + if (cursor) { + cursor->clear_dupecache(); + cursor->set_to_nil(Cursor::kBoth); + } + + st = find_impl(&context, cursor, key, record, flags); + if (st) + return (finalize(&context, st, 0)); + + if (cursor) { + // make sure that txn-cursor and btree-cursor point to the same keys + if (get_flags() & HAM_ENABLE_TRANSACTIONS) { + bool is_equal; + (void)cursor->sync(&context, Cursor::kSyncOnlyEqualKeys, &is_equal); + if (!is_equal && cursor->is_coupled_to_txnop()) + cursor->set_to_nil(Cursor::kBtree); + } + + /* if the key has duplicates: build a duplicate table, then couple to the + * first/oldest duplicate */ + if (get_flags() & HAM_ENABLE_DUPLICATES) + cursor->clear_dupecache(); + + if (cursor->get_dupecache_count(&context)) { + DupeCacheLine *e = cursor->get_dupecache()->get_first_element(); + if (e->use_btree()) + cursor->couple_to_btree(); + else + cursor->couple_to_txnop(); + cursor->couple_to_dupe(1); // 1-based index! 
+ if (record) { // TODO don't copy record if it was already + // copied in find_impl + if (cursor->is_coupled_to_txnop()) + cursor->get_txn_cursor()->copy_coupled_record(record); + else { + Transaction *txn = cursor->get_txn(); + st = cursor->get_btree_cursor()->move(&context, 0, 0, record, + &record_arena(txn), 0); + } + } + } + + /* set a flag that the cursor just completed an Insert-or-find + * operation; this information is needed in ham_cursor_move */ + cursor->set_lastop(Cursor::kLookupOrInsert); + } + + return (finalize(&context, st, 0)); + } + catch (Exception &ex) { + return (ex.code); + } +} + +Cursor * +LocalDatabase::cursor_create_impl(Transaction *txn, uint32_t flags) +{ + return (new Cursor(this, txn, flags)); +} + +Cursor * +LocalDatabase::cursor_clone_impl(Cursor *src) +{ + return (new Cursor(*src)); +} + +ham_status_t +LocalDatabase::cursor_get_record_count(Cursor *cursor, uint32_t flags, + uint32_t *pcount) +{ + try { + Context context(lenv(), (LocalTransaction *)cursor->get_txn(), this); + *pcount = cursor->get_record_count(&context, flags); + return (0); + } + catch (Exception &ex) { + *pcount = 0; + return (ex.code); + } +} + +ham_status_t +LocalDatabase::cursor_get_duplicate_position(Cursor *cursor, + uint32_t *pposition) +{ + try { + *pposition = cursor->get_duplicate_position(); + return (0); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +LocalDatabase::cursor_get_record_size(Cursor *cursor, uint64_t *psize) +{ + try { + Context context(lenv(), (LocalTransaction *)cursor->get_txn(), this); + *psize = cursor->get_record_size(&context); + return (0); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +LocalDatabase::cursor_overwrite(Cursor *cursor, + ham_record_t *record, uint32_t flags) +{ + Context context(lenv(), (LocalTransaction *)cursor->get_txn(), this); + + try { + ham_status_t st = 0; + Transaction *local_txn = 0; + + /* purge cache if necessary */ + 
lenv()->page_manager()->purge_cache(&context); + + /* if user did not specify a transaction, but transactions are enabled: + * create a temporary one */ + if (!cursor->get_txn() && (get_flags() & HAM_ENABLE_TRANSACTIONS)) { + local_txn = begin_temp_txn(); + context.txn = (LocalTransaction *)local_txn; + } + + /* this function will do all the work */ + st = cursor->overwrite(&context, cursor->get_txn() + ? cursor->get_txn() + : local_txn, + record, flags); + return (finalize(&context, st, local_txn)); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +LocalDatabase::cursor_move(Cursor *cursor, ham_key_t *key, + ham_record_t *record, uint32_t flags) +{ + try { + Context context(lenv(), (LocalTransaction *)cursor->get_txn(), + this); + + return (cursor_move_impl(&context, cursor, key, record, flags)); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +LocalDatabase::cursor_move_impl(Context *context, Cursor *cursor, + ham_key_t *key, ham_record_t *record, uint32_t flags) +{ + /* purge cache if necessary */ + lenv()->page_manager()->purge_cache(context); + + /* + * if the cursor was never used before and the user requests a NEXT then + * move the cursor to FIRST; if the user requests a PREVIOUS we set it + * to LAST, resp. 
+ * + * if the cursor was already used but is nil then we've reached EOF, + * and a NEXT actually tries to move to the LAST key (and PREVIOUS + * moves to FIRST) + * + * TODO the btree-cursor has identical code which can be removed + */ + if (cursor->is_nil(0)) { + if (flags & HAM_CURSOR_NEXT) { + flags &= ~HAM_CURSOR_NEXT; + if (cursor->is_first_use()) + flags |= HAM_CURSOR_FIRST; + else + flags |= HAM_CURSOR_LAST; + } + else if (flags & HAM_CURSOR_PREVIOUS) { + flags &= ~HAM_CURSOR_PREVIOUS; + if (cursor->is_first_use()) + flags |= HAM_CURSOR_LAST; + else + flags |= HAM_CURSOR_FIRST; + } + } + + ham_status_t st = 0; + + /* in non-transactional mode - just call the btree function and return */ + if (!(get_flags() & HAM_ENABLE_TRANSACTIONS)) { + return (cursor->get_btree_cursor()->move(context, + key, &key_arena(context->txn), + record, &record_arena(context->txn), flags)); + } + + /* everything else is handled by the cursor function */ + st = cursor->move(context, key, record, flags); + + /* store the direction */ + if (flags & HAM_CURSOR_NEXT) + cursor->set_lastop(HAM_CURSOR_NEXT); + else if (flags & HAM_CURSOR_PREVIOUS) + cursor->set_lastop(HAM_CURSOR_PREVIOUS); + else + cursor->set_lastop(0); + + if (st) { + if (st == HAM_KEY_ERASED_IN_TXN) + st = HAM_KEY_NOT_FOUND; + /* trigger a sync when the function is called again */ + cursor->set_lastop(0); + return (st); + } + + return (0); +} + +void +LocalDatabase::cursor_close_impl(Cursor *cursor) +{ + cursor->close(); +} + +ham_status_t +LocalDatabase::close_impl(uint32_t flags) +{ + Context context(lenv(), 0, this); + + /* check if this database is modified by an active transaction */ + if (m_txn_index) { + TransactionNode *node = m_txn_index->get_first(); + while (node) { + TransactionOperation *op = node->get_newest_op(); + while (op) { + Transaction *optxn = op->get_txn(); + if (!optxn->is_committed() && !optxn->is_aborted()) { + ham_trace(("cannot close a Database that is modified by " + "a currently active 
Transaction")); + return (set_error(HAM_TXN_STILL_OPEN)); + } + op = op->get_previous_in_node(); + } + node = node->get_next_sibling(); + } + } + + /* in-memory-database: free all allocated blobs */ + if (m_btree_index && m_env->get_flags() & HAM_IN_MEMORY) + m_btree_index->release(&context); + + /* + * flush all pages of this database (but not the header page, + * it's still required and will be flushed below) + */ + lenv()->page_manager()->close_database(&context, this); + + return (0); +} + +void +LocalDatabase::increment_dupe_index(Context *context, TransactionNode *node, + Cursor *skip, uint32_t start) +{ + Cursor *c = m_cursor_list; + + while (c) { + bool hit = false; + + if (c == skip || c->is_nil(0)) + goto next; + + /* if cursor is coupled to an op in the same node: increment + * duplicate index (if required) */ + if (c->is_coupled_to_txnop()) { + TransactionCursor *txnc = c->get_txn_cursor(); + TransactionNode *n = txnc->get_coupled_op()->get_node(); + if (n == node) + hit = true; + } + /* if cursor is coupled to the same key in the btree: increment + * duplicate index (if required) */ + else if (c->get_btree_cursor()->points_to(context, node->get_key())) { + hit = true; + } + + if (hit) { + if (c->get_dupecache_index() > start) + c->set_dupecache_index(c->get_dupecache_index() + 1); + } + +next: + c = c->get_next(); + } +} + +void +LocalDatabase::nil_all_cursors_in_node(LocalTransaction *txn, Cursor *current, + TransactionNode *node) +{ + TransactionOperation *op = node->get_newest_op(); + while (op) { + TransactionCursor *cursor = op->cursor_list(); + while (cursor) { + Cursor *parent = cursor->get_parent(); + // is the current cursor to a duplicate? 
then adjust the + // coupled duplicate index of all cursors which point to a duplicate + if (current) { + if (current->get_dupecache_index()) { + if (current->get_dupecache_index() < parent->get_dupecache_index()) { + parent->set_dupecache_index(parent->get_dupecache_index() - 1); + cursor = cursor->get_coupled_next(); + continue; + } + else if (current->get_dupecache_index() > parent->get_dupecache_index()) { + cursor = cursor->get_coupled_next(); + continue; + } + // else fall through + } + } + parent->couple_to_btree(); // TODO merge these two lines + parent->set_to_nil(Cursor::kTxn); + // set a flag that the cursor just completed an Insert-or-find + // operation; this information is needed in ham_cursor_move + // (in this aspect, an erase is the same as insert/find) + parent->set_lastop(Cursor::kLookupOrInsert); + + cursor = op->cursor_list(); + } + + op = op->get_previous_in_node(); + } +} + +ham_status_t +LocalDatabase::copy_record(LocalDatabase *db, Transaction *txn, + TransactionOperation *op, ham_record_t *record) +{ + ByteArray *arena = &db->record_arena(txn); + + if (!(record->flags & HAM_RECORD_USER_ALLOC)) { + arena->resize(op->get_record()->size); + record->data = arena->get_ptr(); + } + memcpy(record->data, op->get_record()->data, op->get_record()->size); + record->size = op->get_record()->size; + return (0); +} + +void +LocalDatabase::nil_all_cursors_in_btree(Context *context, Cursor *current, + ham_key_t *key) +{ + Cursor *c = m_cursor_list; + + /* foreach cursor in this database: + * if it's nil or coupled to the txn: skip it + * if it's coupled to btree AND uncoupled: compare keys; set to nil + * if keys are identical + * if it's uncoupled to btree AND coupled: compare keys; set to nil + * if keys are identical; (TODO - improve performance by nil'ling + * all other cursors from the same btree page) + * + * do NOT nil the current cursor - it's coupled to the key, and the + * coupled key is still needed by the caller + */ + while (c) { + if 
(c->is_nil(0) || c == current) + goto next; + if (c->is_coupled_to_txnop()) + goto next; + + if (c->get_btree_cursor()->points_to(context, key)) { + /* is the current cursor to a duplicate? then adjust the + * coupled duplicate index of all cursors which point to a + * duplicate */ + if (current) { + if (current->get_dupecache_index()) { + if (current->get_dupecache_index() < c->get_dupecache_index()) { + c->set_dupecache_index(c->get_dupecache_index() - 1); + goto next; + } + else if (current->get_dupecache_index() > c->get_dupecache_index()) { + goto next; + } + /* else fall through */ + } + } + c->set_to_nil(0); + } +next: + c = c->get_next(); + } +} + +ham_status_t +LocalDatabase::flush_txn_operation(Context *context, LocalTransaction *txn, + TransactionOperation *op) +{ + ham_status_t st = 0; + TransactionNode *node = op->get_node(); + + /* + * depending on the type of the operation: actually perform the + * operation on the btree + * + * if the txn-op has a cursor attached, then all (txn)cursors + * which are coupled to this op have to be uncoupled, and their + * parent (btree) cursor must be coupled to the btree item instead. + */ + if ((op->get_flags() & TransactionOperation::kInsert) + || (op->get_flags() & TransactionOperation::kInsertOverwrite) + || (op->get_flags() & TransactionOperation::kInsertDuplicate)) { + uint32_t additional_flag = + (op->get_flags() & TransactionOperation::kInsertDuplicate) + ? HAM_DUPLICATE + : HAM_OVERWRITE; + if (!op->cursor_list()) { + st = m_btree_index->insert(context, 0, node->get_key(), op->get_record(), + op->get_orig_flags() | additional_flag); + } + else { + TransactionCursor *tc1 = op->cursor_list(); + Cursor *c1 = tc1->get_parent(); + /* pick the first cursor, get the parent/btree cursor and + * insert the key/record pair in the btree. The btree cursor + * then will be coupled to this item. 
*/ + st = m_btree_index->insert(context, c1, node->get_key(), op->get_record(), + op->get_orig_flags() | additional_flag); + if (!st) { + /* uncouple the cursor from the txn-op, and remove it */ + c1->couple_to_btree(); // TODO merge these two calls + c1->set_to_nil(Cursor::kTxn); + + /* all other (btree) cursors need to be coupled to the same + * item as the first one. */ + TransactionCursor *tc2; + while ((tc2 = op->cursor_list())) { + Cursor *c2 = tc2->get_parent(); + c2->get_btree_cursor()->clone(c1->get_btree_cursor()); + c2->couple_to_btree(); // TODO merge these two calls + c2->set_to_nil(Cursor::kTxn); + } + } + } + } + else if (op->get_flags() & TransactionOperation::kErase) { + st = m_btree_index->erase(context, 0, node->get_key(), + op->get_referenced_dupe(), op->get_flags()); + if (st == HAM_KEY_NOT_FOUND) + st = 0; + } + + return (st); +} + +ham_status_t +LocalDatabase::drop(Context *context) +{ + m_btree_index->release(context); + return (0); +} + +ham_status_t +LocalDatabase::insert_impl(Context *context, Cursor *cursor, + ham_key_t *key, ham_record_t *record, uint32_t flags) +{ + ham_status_t st = 0; + + lenv()->page_manager()->purge_cache(context); + + /* + * if transactions are enabled: only insert the key/record pair into + * the Transaction structure. Otherwise immediately write to the btree. + */ + if (context->txn || m_env->get_flags() & HAM_ENABLE_TRANSACTIONS) + st = insert_txn(context, key, record, flags, cursor + ? cursor->get_txn_cursor() + : 0); + else + st = m_btree_index->insert(context, cursor, key, record, flags); + + // couple the cursor to the inserted key + if (st == 0 && cursor) { + if (m_env->get_flags() & HAM_ENABLE_TRANSACTIONS) { + DupeCache *dc = cursor->get_dupecache(); + // TODO required? 
should have happened in insert_txn + cursor->couple_to_txnop(); + /* the cursor is coupled to the txn-op; nil the btree-cursor to + * trigger a sync() call when fetching the duplicates */ + // TODO merge with the line above + cursor->set_to_nil(Cursor::kBtree); + + /* reset the dupecache, otherwise cursor->get_dupecache_count() + * does not update the dupecache correctly */ + dc->clear(); + + /* if duplicate keys are enabled: set the duplicate index of + * the new key */ + if (st == 0 && cursor->get_dupecache_count(context)) { + TransactionOperation *op = cursor->get_txn_cursor()->get_coupled_op(); + ham_assert(op != 0); + + for (uint32_t i = 0; i < dc->get_count(); i++) { + DupeCacheLine *l = dc->get_element(i); + if (!l->use_btree() && l->get_txn_op() == op) { + cursor->set_dupecache_index(i + 1); + break; + } + } + } + } + else { + // TODO required? should have happened in BtreeInsertAction + cursor->couple_to_btree(); + } + + /* set a flag that the cursor just completed an Insert-or-find + * operation; this information is needed in ham_cursor_move */ + cursor->set_lastop(Cursor::kLookupOrInsert); + } + + return (st); +} + +ham_status_t +LocalDatabase::find_impl(Context *context, Cursor *cursor, + ham_key_t *key, ham_record_t *record, uint32_t flags) +{ + /* purge cache if necessary */ + lenv()->page_manager()->purge_cache(context); + + /* + * if transactions are enabled: read keys from transaction trees, + * otherwise read immediately from disk + */ + if (context->txn || m_env->get_flags() & HAM_ENABLE_TRANSACTIONS) + return (find_txn(context, cursor, key, record, flags)); + + return (m_btree_index->find(context, cursor, key, &key_arena(context->txn), + record, &record_arena(context->txn), flags)); +} + +ham_status_t +LocalDatabase::erase_impl(Context *context, Cursor *cursor, ham_key_t *key, + uint32_t flags) +{ + ham_status_t st = 0; + + /* + * if transactions are enabled: append a 'erase key' operation into + * the txn tree; otherwise immediately erase the 
key from disk + */ + if (context->txn || m_env->get_flags() & HAM_ENABLE_TRANSACTIONS) { + if (cursor) { + /* + * !! + * we have two cases: + * + * 1. the cursor is coupled to a btree item (or uncoupled, but not nil) + * and the txn_cursor is nil; in that case, we have to + * - uncouple the btree cursor + * - insert the erase-op for the key which is used by the btree cursor + * + * 2. the cursor is coupled to a txn-op; in this case, we have to + * - insert the erase-op for the key which is used by the txn-op + * + * TODO clean up this whole mess. code should be like + * + * if (txn) + * erase_txn(txn, cursor->get_key(), 0, cursor->get_txn_cursor()); + */ + /* case 1 described above */ + if (cursor->is_coupled_to_btree()) { + cursor->set_to_nil(Cursor::kTxn); + cursor->get_btree_cursor()->uncouple_from_page(context); + st = erase_txn(context, cursor->get_btree_cursor()->get_uncoupled_key(), + 0, cursor->get_txn_cursor()); + } + /* case 2 described above */ + else { + // TODO this line is ugly + st = erase_txn(context, + cursor->get_txn_cursor()->get_coupled_op()->get_key(), + 0, cursor->get_txn_cursor()); + } + } + else { + st = erase_txn(context, key, flags, 0); + } + } + else { + st = m_btree_index->erase(context, cursor, key, 0, flags); + } + + /* on success: verify that cursor is now nil */ + if (cursor && st == 0) { + cursor->set_to_nil(0); + cursor->couple_to_btree(); // TODO why? 
+ ham_assert(cursor->get_txn_cursor()->is_nil()); + ham_assert(cursor->is_nil(0)); + cursor->clear_dupecache(); // TODO merge with set_to_nil() + } + + return (st); +} + +ham_status_t +LocalDatabase::finalize(Context *context, ham_status_t status, + Transaction *local_txn) +{ + LocalEnvironment *env = lenv(); + + if (status) { + if (local_txn) { + context->changeset.clear(); + env->txn_manager()->abort(local_txn); + } + return (status); + } + + if (local_txn) { + context->changeset.clear(); + env->txn_manager()->commit(local_txn); + } + else if (env->get_flags() & HAM_ENABLE_RECOVERY + && !(env->get_flags() & HAM_ENABLE_TRANSACTIONS)) { + context->changeset.flush(env->next_lsn()); + } + return (0); +} + +LocalTransaction * +LocalDatabase::begin_temp_txn() +{ + LocalTransaction *txn; + ham_status_t st = lenv()->txn_begin((Transaction **)&txn, 0, + HAM_TXN_TEMPORARY | HAM_DONT_LOCK); + if (st) + throw Exception(st); + return (txn); +} + +} // namespace hamsterdb diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4db/db_local.h b/plugins/Dbx_kv/src/hamsterdb/src/4db/db_local.h new file mode 100644 index 0000000000..0d08bd79ed --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4db/db_local.h @@ -0,0 +1,278 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * @exception_safe: unknown + * @thread_safe: no + */ + +#ifndef HAM_DB_LOCAL_H +#define HAM_DB_LOCAL_H + +#include "0root/root.h" + +#include <limits> + +// Always verify that a file of level N does not include headers > N! +#include "1base/scoped_ptr.h" +#include "3btree/btree_index.h" +#include "4txn/txn_local.h" +#include "4db/db.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class TransactionNode; +class TransactionIndex; +class TransactionCursor; +class TransactionOperation; +class LocalEnvironment; +class LocalTransaction; + +template<typename T> +class RecordNumberFixture; + +// +// The database implementation for local file access +// +class LocalDatabase : public Database { + public: + enum { + // The default threshold for inline records + kInlineRecordThreshold = 32 + }; + + // Constructor + LocalDatabase(Environment *env, DatabaseConfiguration &config) + : Database(env, config), m_recno(0), m_cmp_func(0) { + } + + // Returns the btree index + BtreeIndex *btree_index() { + return (m_btree_index.get()); + } + + // Returns the transactional index + TransactionIndex *txn_index() { + return (m_txn_index.get()); + } + + // Returns the LocalEnvironment instance + LocalEnvironment *lenv() { + return ((LocalEnvironment *)m_env); + } + + // Creates a new Database + ham_status_t create(Context *context, PBtreeHeader *btree_header); + + // Opens an existing Database + ham_status_t open(Context *context, PBtreeHeader *btree_header); + + // Erases this Database + ham_status_t drop(Context *context); + + // Fills in the current metrics + virtual void fill_metrics(ham_env_metrics_t *metrics); + + // Returns Database parameters (ham_db_get_parameters) + virtual ham_status_t get_parameters(ham_parameter_t *param); + + // Checks Database integrity (ham_db_check_integrity) + virtual ham_status_t check_integrity(uint32_t flags); + + // Returns the number of keys + virtual ham_status_t count(Transaction *txn, bool 
distinct, + uint64_t *pcount); + + // Scans the whole database, applies a processor function + virtual ham_status_t scan(Transaction *txn, ScanVisitor *visitor, + bool distinct); + + // Inserts a key/value pair (ham_db_insert, ham_cursor_insert) + virtual ham_status_t insert(Cursor *cursor, Transaction *txn, + ham_key_t *key, ham_record_t *record, uint32_t flags); + + // Erase a key/value pair (ham_db_erase, ham_cursor_erase) + virtual ham_status_t erase(Cursor *cursor, Transaction *txn, ham_key_t *key, + uint32_t flags); + + // Lookup of a key/value pair (ham_db_find, ham_cursor_find) + virtual ham_status_t find(Cursor *cursor, Transaction *txn, ham_key_t *key, + ham_record_t *record, uint32_t flags); + + // Returns number of duplicates (ham_cursor_get_record_count) + virtual ham_status_t cursor_get_record_count(Cursor *cursor, uint32_t flags, + uint32_t *pcount); + + // Returns position in duplicate list (ham_cursor_get_duplicate_position) + virtual ham_status_t cursor_get_duplicate_position(Cursor *cursor, + uint32_t *pposition); + + // Get current record size (ham_cursor_get_record_size) + virtual ham_status_t cursor_get_record_size(Cursor *cursor, + uint64_t *psize); + + // Overwrites the record of a cursor (ham_cursor_overwrite) + virtual ham_status_t cursor_overwrite(Cursor *cursor, + ham_record_t *record, uint32_t flags); + + // Moves a cursor, returns key and/or record (ham_cursor_move) + virtual ham_status_t cursor_move(Cursor *cursor, ham_key_t *key, + ham_record_t *record, uint32_t flags); + + // Inserts a key/record pair in a txn node; if cursor is not NULL it will + // be attached to the new txn_op structure + // TODO this should be private + ham_status_t insert_txn(Context *context, ham_key_t *key, + ham_record_t *record, uint32_t flags, + TransactionCursor *cursor); + + // Returns the default comparison function + ham_compare_func_t compare_func() { + return (m_cmp_func); + } + + // Sets the default comparison function (ham_db_set_compare_func) + 
ham_status_t set_compare_func(ham_compare_func_t f) { + if (m_config.key_type != HAM_TYPE_CUSTOM) { + ham_trace(("ham_set_compare_func only allowed for HAM_TYPE_CUSTOM " + "databases!")); + return (HAM_INV_PARAMETER); + } + m_cmp_func = f; + return (0); + } + + // Flushes a TransactionOperation to the btree + // TODO should be private + ham_status_t flush_txn_operation(Context *context, LocalTransaction *txn, + TransactionOperation *op); + + protected: + friend class Cursor; + + // Copies the ham_record_t structure from |op| into |record| + static ham_status_t copy_record(LocalDatabase *db, Transaction *txn, + TransactionOperation *op, ham_record_t *record); + + // Creates a cursor; this is the actual implementation + virtual Cursor *cursor_create_impl(Transaction *txn, uint32_t flags); + + // Clones a cursor; this is the actual implementation + virtual Cursor *cursor_clone_impl(Cursor *src); + + // Closes a cursor; this is the actual implementation + virtual void cursor_close_impl(Cursor *c); + + // Closes a database; this is the actual implementation + virtual ham_status_t close_impl(uint32_t flags); + + private: + friend struct DbFixture; + friend struct HamsterdbFixture; + friend struct ExtendedKeyFixture; + friend class RecordNumberFixture<uint32_t>; + friend class RecordNumberFixture<uint64_t>; + + // Erases a key/record pair from a txn; on success, cursor will be set to + // nil + ham_status_t erase_txn(Context *context, ham_key_t *key, uint32_t flags, + TransactionCursor *cursor); + + // Lookup of a key/record pair in the Transaction index and in the btree, + // if transactions are disabled/not successful; copies the + // record into |record|. Also performs approx. matching. 
+ ham_status_t find_txn(Context *context, Cursor *cursor, + ham_key_t *key, ham_record_t *record, uint32_t flags); + + // Moves a cursor, returns key and/or record (ham_cursor_move) + ham_status_t cursor_move_impl(Context *context, Cursor *cursor, + ham_key_t *key, ham_record_t *record, uint32_t flags); + + // The actual implementation of insert() + ham_status_t insert_impl(Context *context, Cursor *cursor, + ham_key_t *key, ham_record_t *record, uint32_t flags); + + // The actual implementation of find() + ham_status_t find_impl(Context *context, Cursor *cursor, + ham_key_t *key, ham_record_t *record, uint32_t flags); + + // The actual implementation of erase() + ham_status_t erase_impl(Context *context, Cursor *cursor, + ham_key_t *key, uint32_t flags); + + // Finalizes an operation by committing or aborting the |local_txn| + // and clearing or flushing the Changeset. + // Returns |status|. + ham_status_t finalize(Context *context, ham_status_t status, + Transaction *local_txn); + + // Begins a new temporary Transaction + LocalTransaction *begin_temp_txn(); + + // returns the next record number + uint64_t next_record_number() { + m_recno++; + if (m_config.flags & HAM_RECORD_NUMBER32 + && m_recno > std::numeric_limits<uint32_t>::max()) + throw Exception(HAM_LIMITS_REACHED); + else if (m_recno == 0) + throw Exception(HAM_LIMITS_REACHED); + return (m_recno); + } + + // Checks if an insert operation conflicts with another txn; this is the + // case if the same key is modified by another active txn. + ham_status_t check_insert_conflicts(Context *context, TransactionNode *node, + ham_key_t *key, uint32_t flags); + + // Checks if an erase operation conflicts with another txn; this is the + // case if the same key is modified by another active txn. 
+ ham_status_t check_erase_conflicts(Context *context, TransactionNode *node, + ham_key_t *key, uint32_t flags); + + // Increments dupe index of all cursors with a dupe index > |start|; + // only cursor |skip| is ignored + void increment_dupe_index(Context *context, TransactionNode *node, + Cursor *skip, uint32_t start); + + // Sets all cursors attached to a TransactionNode to nil + void nil_all_cursors_in_node(LocalTransaction *txn, Cursor *current, + TransactionNode *node); + + // Sets all cursors to nil if they point to |key| in the btree index + void nil_all_cursors_in_btree(Context *context, Cursor *current, + ham_key_t *key); + + // the current record number + uint64_t m_recno; + + // the btree index + ScopedPtr<BtreeIndex> m_btree_index; + + // the transaction index + ScopedPtr<TransactionIndex> m_txn_index; + + // the comparison function + ham_compare_func_t m_cmp_func; +}; + +} // namespace hamsterdb + +#endif /* HAM_DB_LOCAL_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4db/db_remote.cc b/plugins/Dbx_kv/src/hamsterdb/src/4db/db_remote.cc new file mode 100644 index 0000000000..58bd49f4db --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4db/db_remote.cc @@ -0,0 +1,635 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef HAM_ENABLE_REMOTE + +#include "0root/root.h" + +#include <string.h> + +// Always verify that a file of level N does not include headers > N! 
+#include "1base/scoped_ptr.h" +#include "2protobuf/protocol.h" +#include "4db/db_remote.h" +#include "4env/env_remote.h" +#include "4txn/txn_remote.h" +#include "4cursor/cursor.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +ham_status_t +RemoteDatabase::get_parameters(ham_parameter_t *param) +{ + try { + RemoteEnvironment *env = renv(); + + Protocol request(Protocol::DB_GET_PARAMETERS_REQUEST); + request.mutable_db_get_parameters_request()->set_db_handle(m_remote_handle); + + ham_parameter_t *p = param; + if (p) { + for (; p->name; p++) + request.mutable_db_get_parameters_request()->add_names(p->name); + } + + ScopedPtr<Protocol> reply(env->perform_request(&request)); + + ham_assert(reply->has_db_get_parameters_reply()); + + ham_status_t st = reply->db_get_parameters_reply().status(); + if (st) + throw Exception(st); + + p = param; + while (p && p->name) { + switch (p->name) { + case HAM_PARAM_FLAGS: + ham_assert(reply->db_get_parameters_reply().has_flags()); + p->value = reply->db_get_parameters_reply().flags(); + break; + case HAM_PARAM_KEY_SIZE: + ham_assert(reply->db_get_parameters_reply().has_key_size()); + p->value = reply->db_get_parameters_reply().key_size(); + break; + case HAM_PARAM_RECORD_SIZE: + ham_assert(reply->db_get_parameters_reply().has_record_size()); + p->value = reply->db_get_parameters_reply().record_size(); + break; + case HAM_PARAM_KEY_TYPE: + ham_assert(reply->db_get_parameters_reply().has_key_type()); + p->value = reply->db_get_parameters_reply().key_type(); + break; + case HAM_PARAM_DATABASE_NAME: + ham_assert(reply->db_get_parameters_reply().has_dbname()); + p->value = reply->db_get_parameters_reply().dbname(); + break; + case HAM_PARAM_MAX_KEYS_PER_PAGE: + ham_assert(reply->db_get_parameters_reply().has_keys_per_page()); + p->value = reply->db_get_parameters_reply().keys_per_page(); + break; + default: + ham_trace(("unknown parameter %d", (int)p->name)); + break; + } + p++; + } + return 
(0); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +RemoteDatabase::check_integrity(uint32_t flags) +{ + try { + RemoteEnvironment *env = renv(); + + Protocol request(Protocol::DB_CHECK_INTEGRITY_REQUEST); + request.mutable_db_check_integrity_request()->set_db_handle(m_remote_handle); + request.mutable_db_check_integrity_request()->set_flags(flags); + + std::auto_ptr<Protocol> reply(env->perform_request(&request)); + + ham_assert(reply->has_db_check_integrity_reply()); + + return (reply->db_check_integrity_reply().status()); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +RemoteDatabase::count(Transaction *htxn, bool distinct, uint64_t *pcount) +{ + try { + RemoteEnvironment *env = renv(); + RemoteTransaction *txn = dynamic_cast<RemoteTransaction *>(htxn); + + SerializedWrapper request; + request.id = kDbGetKeyCountRequest; + request.db_count_request.db_handle = m_remote_handle; + request.db_count_request.txn_handle = txn + ? txn->get_remote_handle() + : 0; + request.db_count_request.distinct = distinct; + + SerializedWrapper reply; + env->perform_request(&request, &reply); + + ham_assert(reply.id == kDbGetKeyCountReply); + + ham_status_t st = reply.db_count_reply.status; + if (st) + return (st); + + *pcount = reply.db_count_reply.keycount; + return (0); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +RemoteDatabase::insert(Cursor *cursor, Transaction *htxn, ham_key_t *key, + ham_record_t *record, uint32_t flags) +{ + try { + bool send_key = true; + RemoteEnvironment *env = renv(); + RemoteTransaction *txn = dynamic_cast<RemoteTransaction *>(htxn); + + ByteArray *arena = &key_arena(txn); + + /* recno: do not send the key */ + if (get_flags() & HAM_RECORD_NUMBER32) { + send_key = false; + if (!key->data) { + arena->resize(sizeof(uint32_t)); + key->data = arena->get_ptr(); + key->size = sizeof(uint32_t); + } + } + else if (get_flags() & HAM_RECORD_NUMBER64) { + send_key = false; + if 
(!key->data) { + arena->resize(sizeof(uint64_t)); + key->data = arena->get_ptr(); + key->size = sizeof(uint64_t); + } + } + + SerializedWrapper request; + SerializedWrapper reply; + + if (cursor) { + SerializedWrapper request; + request.id = kCursorInsertRequest; + request.cursor_insert_request.cursor_handle = cursor->get_remote_handle(); + request.cursor_insert_request.flags = flags; + if (send_key) { + request.cursor_insert_request.has_key = true; + request.cursor_insert_request.key.has_data = true; + request.cursor_insert_request.key.data.size = key->size; + request.cursor_insert_request.key.data.value = (uint8_t *)key->data; + request.cursor_insert_request.key.flags = key->flags; + request.cursor_insert_request.key.intflags = key->_flags; + } + if (record) { + request.cursor_insert_request.has_record = true; + request.cursor_insert_request.record.has_data = true; + request.cursor_insert_request.record.data.size = record->size; + request.cursor_insert_request.record.data.value = (uint8_t *)record->data; + request.cursor_insert_request.record.flags = record->flags; + request.cursor_insert_request.record.partial_size = record->partial_size; + request.cursor_insert_request.record.partial_offset = record->partial_offset; + } + + env->perform_request(&request, &reply); + + ham_assert(reply.id == kCursorInsertReply); + + ham_status_t st = reply.cursor_insert_reply.status; + if (st) + return (st); + + if (reply.cursor_insert_reply.has_key) { + ham_assert(key->size == reply.cursor_insert_reply.key.data.size); + ham_assert(key->data != 0); + ::memcpy(key->data, reply.cursor_insert_reply.key.data.value, key->size); + } + } + else { + request.id = kDbInsertRequest; + request.db_insert_request.db_handle = m_remote_handle; + request.db_insert_request.txn_handle = txn ? 
txn->get_remote_handle() : 0; + request.db_insert_request.flags = flags; + if (key && !(get_flags() & (HAM_RECORD_NUMBER32 | HAM_RECORD_NUMBER64))) { + request.db_insert_request.has_key = true; + request.db_insert_request.key.has_data = true; + request.db_insert_request.key.data.size = key->size; + request.db_insert_request.key.data.value = (uint8_t *)key->data; + request.db_insert_request.key.flags = key->flags; + request.db_insert_request.key.intflags = key->_flags; + } + if (record) { + request.db_insert_request.has_record = true; + request.db_insert_request.record.has_data = true; + request.db_insert_request.record.data.size = record->size; + request.db_insert_request.record.data.value = (uint8_t *)record->data; + request.db_insert_request.record.flags = record->flags; + request.db_insert_request.record.partial_size = record->partial_size; + request.db_insert_request.record.partial_offset = record->partial_offset; + } + + env->perform_request(&request, &reply); + + ham_assert(reply.id == kDbInsertReply); + + ham_status_t st = reply.db_insert_reply.status; + if (st) + return (st); + + if (reply.db_insert_reply.has_key) { + ham_assert(key->data != 0); + ham_assert(key->size == reply.db_insert_reply.key.data.size); + ::memcpy(key->data, reply.db_insert_reply.key.data.value, key->size); + } + } + return (0); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +RemoteDatabase::erase(Cursor *cursor, Transaction *htxn, ham_key_t *key, + uint32_t flags) +{ + try { + if (cursor) { + SerializedWrapper request; + request.id = kCursorEraseRequest; + request.cursor_erase_request.cursor_handle = cursor->get_remote_handle(); + request.cursor_erase_request.flags = flags; + + SerializedWrapper reply; + renv()->perform_request(&request, &reply); + ham_assert(reply.id == kCursorEraseReply); + return (reply.cursor_erase_reply.status); + } + + RemoteEnvironment *env = renv(); + RemoteTransaction *txn = dynamic_cast<RemoteTransaction *>(htxn); + + 
SerializedWrapper request; + request.id = kDbEraseRequest; + request.db_erase_request.db_handle = m_remote_handle; + request.db_erase_request.txn_handle = txn ? txn->get_remote_handle() : 0; + request.db_erase_request.flags = flags; + request.db_erase_request.key.has_data = true; + request.db_erase_request.key.data.size = key->size; + request.db_erase_request.key.data.value = (uint8_t *)key->data; + request.db_erase_request.key.flags = key->flags; + request.db_erase_request.key.intflags = key->_flags; + + SerializedWrapper reply; + env->perform_request(&request, &reply); + + ham_assert(reply.id == kDbEraseReply); + + return (reply.db_erase_reply.status); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +RemoteDatabase::find(Cursor *cursor, Transaction *htxn, ham_key_t *key, + ham_record_t *record, uint32_t flags) +{ + try { + if (cursor && !htxn) + htxn = cursor->get_txn(); + + RemoteEnvironment *env = renv(); + RemoteTransaction *txn = dynamic_cast<RemoteTransaction *>(htxn); + + SerializedWrapper request; + request.id = kDbFindRequest; + request.db_find_request.db_handle = m_remote_handle; + request.db_find_request.cursor_handle = cursor ? cursor->get_remote_handle() : 0; + request.db_find_request.txn_handle = txn ? 
txn->get_remote_handle() : 0; + request.db_find_request.flags = flags; + request.db_find_request.key.has_data = true; + request.db_find_request.key.data.size = key->size; + request.db_find_request.key.data.value = (uint8_t *)key->data; + request.db_find_request.key.flags = key->flags; + request.db_find_request.key.intflags = key->_flags; + if (record) { + request.db_find_request.has_record = true; + request.db_find_request.record.has_data = true; + request.db_find_request.record.data.size = record->size; + request.db_find_request.record.data.value = (uint8_t *)record->data; + request.db_find_request.record.flags = record->flags; + request.db_find_request.record.partial_size = record->partial_size; + request.db_find_request.record.partial_offset = record->partial_offset; + } + + SerializedWrapper reply; + env->perform_request(&request, &reply); + ham_assert(reply.id == kDbFindReply); + + ByteArray *pkey_arena = &key_arena(txn); + ByteArray *rec_arena = &record_arena(txn); + + ham_status_t st = reply.db_find_reply.status; + if (st == 0) { + /* approx. matching: need to copy the _flags and the key data! 
*/ + if (reply.db_find_reply.has_key) { + ham_assert(key); + key->_flags = reply.db_find_reply.key.intflags; + key->size = (uint16_t)reply.db_find_reply.key.data.size; + if (!(key->flags & HAM_KEY_USER_ALLOC)) { + pkey_arena->resize(key->size); + key->data = pkey_arena->get_ptr(); + } + ::memcpy(key->data, (void *)reply.db_find_reply.key.data.value, + key->size); + } + if (record && reply.db_find_reply.has_record) { + record->size = reply.db_find_reply.record.data.size; + if (!(record->flags & HAM_RECORD_USER_ALLOC)) { + rec_arena->resize(record->size); + record->data = rec_arena->get_ptr(); + } + ::memcpy(record->data, (void *)reply.db_find_reply.record.data.value, + record->size); + } + } + return (st); + } + catch (Exception &ex) { + return (ex.code); + } +} + +Cursor * +RemoteDatabase::cursor_create_impl(Transaction *htxn, uint32_t flags) +{ + RemoteTransaction *txn = dynamic_cast<RemoteTransaction *>(htxn); + + SerializedWrapper request; + request.id = kCursorCreateRequest; + request.cursor_create_request.db_handle = m_remote_handle; + request.cursor_create_request.txn_handle = txn + ? txn->get_remote_handle() + : 0; + request.cursor_create_request.flags = flags; + + SerializedWrapper reply; + renv()->perform_request(&request, &reply); + ham_assert(reply.id == kCursorCreateReply); + ham_status_t st = reply.cursor_create_reply.status; + if (st) + return (0); + + Cursor *c = new Cursor((LocalDatabase *)this); // TODO this cast is evil!! 
+ c->set_remote_handle(reply.cursor_create_reply.cursor_handle); + return (c); +} + +Cursor * +RemoteDatabase::cursor_clone_impl(Cursor *src) +{ + SerializedWrapper request; + request.id = kCursorCloneRequest; + request.cursor_clone_request.cursor_handle = src->get_remote_handle(); + + SerializedWrapper reply; + renv()->perform_request(&request, &reply); + ham_assert(reply.id == kCursorCloneReply); + ham_status_t st = reply.cursor_clone_reply.status; + if (st) + return (0); + + Cursor *c = new Cursor(src->get_db()); + c->set_remote_handle(reply.cursor_clone_reply.cursor_handle); + return (c); +} + +ham_status_t +RemoteDatabase::cursor_get_record_count(Cursor *cursor, uint32_t flags, + uint32_t *pcount) +{ + try { + RemoteEnvironment *env = renv(); + + SerializedWrapper request; + request.id = kCursorGetRecordCountRequest; + request.cursor_get_record_count_request.cursor_handle = + cursor->get_remote_handle(); + request.cursor_get_record_count_request.flags = flags; + + SerializedWrapper reply; + env->perform_request(&request, &reply); + ham_assert(reply.id == kCursorGetRecordCountReply); + + ham_status_t st = reply.cursor_get_record_count_reply.status; + if (st == 0) + *pcount = reply.cursor_get_record_count_reply.count; + else + *pcount = 0; + return (st); + } + catch (Exception &ex) { + *pcount = 0; + return (ex.code); + } +} + +ham_status_t +RemoteDatabase::cursor_get_duplicate_position(Cursor *cursor, + uint32_t *pposition) +{ + try { + RemoteEnvironment *env = renv(); + + SerializedWrapper request; + request.id = kCursorGetDuplicatePositionRequest; + request.cursor_get_duplicate_position_request.cursor_handle = + cursor->get_remote_handle(); + + SerializedWrapper reply; + env->perform_request(&request, &reply); + ham_assert(reply.id == kCursorGetDuplicatePositionReply); + + ham_status_t st = reply.cursor_get_duplicate_position_reply.status; + if (st == 0) + *pposition = reply.cursor_get_duplicate_position_reply.position; + return (st); + } + catch (Exception 
&ex) { + return (ex.code); + } +} + +ham_status_t +RemoteDatabase::cursor_get_record_size(Cursor *cursor, uint64_t *psize) +{ + try { + RemoteEnvironment *env = renv(); + + SerializedWrapper request; + request.id = kCursorGetRecordSizeRequest; + request.cursor_get_record_size_request.cursor_handle = + cursor->get_remote_handle(); + + SerializedWrapper reply; + env->perform_request(&request, &reply); + ham_assert(reply.id == kCursorGetRecordSizeReply); + + ham_status_t st = reply.cursor_get_record_size_reply.status; + if (st == 0) + *psize = reply.cursor_get_record_size_reply.size; + return (0); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +RemoteDatabase::cursor_overwrite(Cursor *cursor, + ham_record_t *record, uint32_t flags) +{ + try { + RemoteEnvironment *env = renv(); + + SerializedWrapper request; + request.id = kCursorOverwriteRequest; + request.cursor_overwrite_request.cursor_handle = cursor->get_remote_handle(); + request.cursor_overwrite_request.flags = flags; + + if (record->size > 0) { + request.cursor_overwrite_request.record.has_data = true; + request.cursor_overwrite_request.record.data.size = record->size; + request.cursor_overwrite_request.record.data.value = (uint8_t *)record->data; + } + request.cursor_overwrite_request.record.flags = record->flags; + request.cursor_overwrite_request.record.partial_size = record->partial_size; + request.cursor_overwrite_request.record.partial_offset = record->partial_offset; + + SerializedWrapper reply; + env->perform_request(&request, &reply); + ham_assert(reply.id == kCursorOverwriteReply); + + return (reply.cursor_overwrite_reply.status); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +RemoteDatabase::cursor_move(Cursor *cursor, ham_key_t *key, + ham_record_t *record, uint32_t flags) +{ + try { + RemoteEnvironment *env = renv(); + + RemoteTransaction *txn = dynamic_cast<RemoteTransaction *>(cursor->get_txn()); + ByteArray *pkey_arena = &key_arena(txn); + 
ByteArray *prec_arena = &record_arena(txn); + + Protocol request(Protocol::CURSOR_MOVE_REQUEST); + request.mutable_cursor_move_request()->set_cursor_handle(cursor->get_remote_handle()); + request.mutable_cursor_move_request()->set_flags(flags); + if (key) + Protocol::assign_key(request.mutable_cursor_move_request()->mutable_key(), + key, false); + if (record) + Protocol::assign_record(request.mutable_cursor_move_request()->mutable_record(), + record, false); + + ScopedPtr<Protocol> reply(env->perform_request(&request)); + + ham_assert(reply->has_cursor_move_reply() != 0); + + ham_status_t st = reply->cursor_move_reply().status(); + if (st) + return (st); + + /* modify key/record, but make sure that USER_ALLOC is respected! */ + if (reply->cursor_move_reply().has_key()) { + ham_assert(key); + key->_flags = reply->cursor_move_reply().key().intflags(); + key->size = (uint16_t)reply->cursor_move_reply().key().data().size(); + if (!(key->flags & HAM_KEY_USER_ALLOC)) { + pkey_arena->resize(key->size); + key->data = pkey_arena->get_ptr(); + } + memcpy(key->data, (void *)&reply->cursor_move_reply().key().data()[0], + key->size); + } + + /* same for the record */ + if (reply->cursor_move_reply().has_record()) { + ham_assert(record); + record->size = reply->cursor_move_reply().record().data().size(); + if (!(record->flags & HAM_RECORD_USER_ALLOC)) { + prec_arena->resize(record->size); + record->data = prec_arena->get_ptr(); + } + memcpy(record->data, (void *)&reply->cursor_move_reply().record().data()[0], + record->size); + } + return (0); + } + catch (Exception &ex) { + return (ex.code); + } +} + +void +RemoteDatabase::cursor_close_impl(Cursor *cursor) +{ + SerializedWrapper request; + request.id = kCursorCloseRequest; + request.cursor_close_request.cursor_handle = cursor->get_remote_handle(); + + SerializedWrapper reply; + renv()->perform_request(&request, &reply); + ham_assert(reply.id == kCursorCloseReply); +} + +ham_status_t +RemoteDatabase::close_impl(uint32_t flags) 
+{ + RemoteEnvironment *env = renv(); + + // do not set HAM_DONT_LOCK over the network + flags &= ~HAM_DONT_LOCK; + + Protocol request(Protocol::DB_CLOSE_REQUEST); + request.mutable_db_close_request()->set_db_handle(m_remote_handle); + request.mutable_db_close_request()->set_flags(flags); + + ScopedPtr<Protocol> reply(env->perform_request(&request)); + + ham_assert(reply->has_db_close_reply()); + + ham_status_t st = reply->db_close_reply().status(); + if (st == 0) + m_remote_handle = 0; + + return (st); +} + + +} // namespace hamsterdb + +#endif // HAM_ENABLE_REMOTE + diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4db/db_remote.h b/plugins/Dbx_kv/src/hamsterdb/src/4db/db_remote.h new file mode 100644 index 0000000000..1a492418bc --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4db/db_remote.h @@ -0,0 +1,131 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_DB_REMOTE_H +#define HAM_DB_REMOTE_H + +#ifdef HAM_ENABLE_REMOTE + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! 
#include "4db/db.h"

#ifndef HAM_ROOT_H
#  error "root.h was not included"
#endif

namespace hamsterdb {

struct Context;
class Environment;
class RemoteEnvironment;

/*
 * The database implementation for remote file access
 *
 * Every operation is forwarded to a server through the RemoteEnvironment;
 * the Database on the server side is identified by |m_remote_handle|.
 */
class RemoteDatabase : public Database
{
  public:
    // Constructor; |remote_handle| is the handle which identifies this
    // Database in requests sent to the server
    RemoteDatabase(Environment *env, DatabaseConfiguration config,
                    uint64_t remote_handle)
      : Database(env, config), m_remote_handle(remote_handle) {
    }

    // Fills in the current metrics; intentionally empty, a RemoteDatabase
    // does not modify |metrics|
    virtual void fill_metrics(ham_env_metrics_t *metrics) { }

    // Returns Database parameters (ham_db_get_parameters)
    virtual ham_status_t get_parameters(ham_parameter_t *param);

    // Checks Database integrity (ham_db_check_integrity)
    virtual ham_status_t check_integrity(uint32_t flags);

    // Returns the number of keys
    virtual ham_status_t count(Transaction *txn, bool distinct,
                    uint64_t *pcount);

    // Scans the whole database, applies a processor function; not
    // available for remote databases (always returns HAM_NOT_IMPLEMENTED)
    virtual ham_status_t scan(Transaction *txn, ScanVisitor *visitor,
                    bool distinct) {
      return (HAM_NOT_IMPLEMENTED);
    }

    // Inserts a key/value pair (ham_db_insert, ham_cursor_insert)
    virtual ham_status_t insert(Cursor *cursor, Transaction *txn,
                    ham_key_t *key, ham_record_t *record, uint32_t flags);

    // Erase a key/value pair (ham_db_erase, ham_cursor_erase)
    virtual ham_status_t erase(Cursor *cursor, Transaction *txn, ham_key_t *key,
                    uint32_t flags);

    // Lookup of a key/value pair (ham_db_find, ham_cursor_find)
    virtual ham_status_t find(Cursor *cursor, Transaction *txn, ham_key_t *key,
                    ham_record_t *record, uint32_t flags);

    // Returns number of duplicates (ham_cursor_get_record_count)
    virtual ham_status_t cursor_get_record_count(Cursor *cursor, uint32_t flags,
                    uint32_t *pcount);

    // Returns position in duplicate list (ham_cursor_get_duplicate_position)
    virtual ham_status_t cursor_get_duplicate_position(Cursor *cursor,
                    uint32_t *pposition);

    // Get current record size (ham_cursor_get_record_size)
    virtual ham_status_t cursor_get_record_size(Cursor *cursor,
                    uint64_t *psize);

    // Overwrites the record of a cursor (ham_cursor_overwrite)
    virtual ham_status_t cursor_overwrite(Cursor *cursor,
                    ham_record_t *record, uint32_t flags);

    // Moves a cursor, returns key and/or record (ham_cursor_move)
    virtual ham_status_t cursor_move(Cursor *cursor, ham_key_t *key,
                    ham_record_t *record, uint32_t flags);

  protected:
    // Creates a cursor; this is the actual implementation
    virtual Cursor *cursor_create_impl(Transaction *txn, uint32_t flags);

    // Clones a cursor; this is the actual implementation
    virtual Cursor *cursor_clone_impl(Cursor *src);

    // Closes a cursor; this is the actual implementation
    virtual void cursor_close_impl(Cursor *c);

    // Closes a database; this is the actual implementation
    virtual ham_status_t close_impl(uint32_t flags);

  private:
    // Returns the RemoteEnvironment instance
    //
    // NOTE(review): unchecked C-style cast of |m_env|; RemoteEnvironment is
    // only forward-declared in this header - presumably a RemoteDatabase is
    // always owned by a RemoteEnvironment, confirm against the callers
    RemoteEnvironment *renv() {
      return ((RemoteEnvironment *)m_env);
    }

    // the remote database handle
    uint64_t m_remote_handle;
};

} // namespace hamsterdb

#endif /* HAM_ENABLE_REMOTE */

#endif /* HAM_DB_REMOTE_H */
diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4env/env.cc b/plugins/Dbx_kv/src/hamsterdb/src/4env/env.cc
new file mode 100644
index 0000000000..6e3a494f6d
--- /dev/null
+++ b/plugins/Dbx_kv/src/hamsterdb/src/4env/env.cc
@@ -0,0 +1,333 @@
/*
 * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de).
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "4db/db.h" +#include "4env/env.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +using namespace hamsterdb; + +namespace hamsterdb { + +ham_status_t +Environment::create() +{ + try { + return (do_create()); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +Environment::open() +{ + try { + return (do_open()); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +Environment::get_database_names(uint16_t *names, uint32_t *count) +{ + try { + ScopedLock lock(m_mutex); + return (do_get_database_names(names, count)); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +Environment::get_parameters(ham_parameter_t *param) +{ + try { + ScopedLock lock(m_mutex); + return (do_get_parameters(param)); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +Environment::flush(uint32_t flags) +{ + try { + ScopedLock lock(m_mutex); + return (do_flush(flags)); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +Environment::create_db(Database **pdb, DatabaseConfiguration &config, + const ham_parameter_t *param) +{ + try { + ScopedLock lock(m_mutex); + + ham_status_t st = do_create_db(pdb, config, param); + + // on success: store the open database in the environment's list of + // opened databases + if (st == 0) { + m_database_map[config.db_name] = *pdb; + /* flush the environment to make sure that the header page is written + * to disk */ + if (st == 0) + st = do_flush(0); + } + else { + if (*pdb) + (void)ham_db_close((ham_db_t *)*pdb, HAM_DONT_LOCK); + } + return (st); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +Environment::open_db(Database **pdb, DatabaseConfiguration &config, + const ham_parameter_t *param) 
+{ + try { + ScopedLock lock(m_mutex); + + /* make sure that this database is not yet open */ + if (m_database_map.find(config.db_name) != m_database_map.end()) + return (HAM_DATABASE_ALREADY_OPEN); + + ham_status_t st = do_open_db(pdb, config, param); + + // on success: store the open database in the environment's list of + // opened databases + if (st == 0) + m_database_map[config.db_name] = *pdb; + else { + if (*pdb) + (void)ham_db_close((ham_db_t *)*pdb, HAM_DONT_LOCK); + } + return (st); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +Environment::rename_db(uint16_t oldname, uint16_t newname, uint32_t flags) +{ + try { + ScopedLock lock(m_mutex); + return (do_rename_db(oldname, newname, flags)); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +Environment::erase_db(uint16_t dbname, uint32_t flags) +{ + try { + ScopedLock lock(m_mutex); + return (do_erase_db(dbname, flags)); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +Environment::close_db(Database *db, uint32_t flags) +{ + ham_status_t st = 0; + + try { + ScopedLock lock; + if (!(flags & HAM_DONT_LOCK)) + lock = ScopedLock(m_mutex); + + uint16_t dbname = db->name(); + + // flush committed Transactions + st = do_flush(HAM_FLUSH_COMMITTED_TRANSACTIONS); + if (st) + return (st); + + st = db->close(flags); + if (st) + return (st); + + m_database_map.erase(dbname); + delete db; + + /* in-memory database: make sure that a database with the same name + * can be re-created */ + if (m_config.flags & HAM_IN_MEMORY) + do_erase_db(dbname, 0); + return (0); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +Environment::txn_begin(Transaction **ptxn, const char *name, uint32_t flags) +{ + try { + ScopedLock lock; + if (!(flags & HAM_DONT_LOCK)) + lock = ScopedLock(m_mutex); + + if (!(m_config.flags & HAM_ENABLE_TRANSACTIONS)) { + ham_trace(("transactions are disabled (see HAM_ENABLE_TRANSACTIONS)")); + return 
(HAM_INV_PARAMETER); + } + + *ptxn = do_txn_begin(name, flags); + return (0); + } + catch (Exception &ex) { + *ptxn = 0; + return (ex.code); + } +} + +std::string +Environment::txn_get_name(Transaction *txn) +{ + try { + ScopedLock lock(m_mutex); + return (txn->get_name()); + } + catch (Exception &) { + return (""); + } +} + +ham_status_t +Environment::txn_commit(Transaction *txn, uint32_t flags) +{ + try { + ScopedLock lock(m_mutex); + return (do_txn_commit(txn, flags)); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +Environment::txn_abort(Transaction *txn, uint32_t flags) +{ + try { + ScopedLock lock(m_mutex); + return (do_txn_abort(txn, flags)); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +Environment::close(uint32_t flags) +{ + ham_status_t st = 0; + + try { + ScopedLock lock(m_mutex); + + /* auto-abort (or commit) all pending transactions */ + if (m_txn_manager.get()) { + Transaction *t; + + while ((t = m_txn_manager->get_oldest_txn())) { + if (!t->is_aborted() && !t->is_committed()) { + if (flags & HAM_TXN_AUTO_COMMIT) + st = m_txn_manager->commit(t, 0); + else /* if (flags & HAM_TXN_AUTO_ABORT) */ + st = m_txn_manager->abort(t, 0); + if (st) + return (st); + } + + m_txn_manager->flush_committed_txns(); + } + } + + /* flush all remaining transactions */ + if (m_txn_manager) + m_txn_manager->flush_committed_txns(); + + /* close all databases */ + Environment::DatabaseMap::iterator it = m_database_map.begin(); + while (it != m_database_map.end()) { + Environment::DatabaseMap::iterator it2 = it; it++; + Database *db = it2->second; + if (flags & HAM_AUTO_CLEANUP) + st = close_db(db, flags | HAM_DONT_LOCK); + else + st = db->close(flags); + if (st) + return (st); + } + m_database_map.clear(); + + return (do_close(flags)); + } + catch (Exception &ex) { + return (ex.code); + } +} + +ham_status_t +Environment::fill_metrics(ham_env_metrics_t *metrics) +{ + try { + ScopedLock lock(m_mutex); + 
do_fill_metrics(metrics); + return (0); + } + catch (Exception &ex) { + return (ex.code); + } +} + +EnvironmentTest +Environment::test() +{ + return (EnvironmentTest(m_config)); +} + +} // namespace hamsterdb diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4env/env.h b/plugins/Dbx_kv/src/hamsterdb/src/4env/env.h new file mode 100644 index 0000000000..c0841151df --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4env/env.h @@ -0,0 +1,210 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @exception_safe: nothrow + * @thread_safe: yes + */ + +#ifndef HAM_ENV_H +#define HAM_ENV_H + +#include "0root/root.h" + +#include <map> +#include <string> + +#include "ham/hamsterdb_int.h" + +// Always verify that a file of level N does not include headers > N! +#include "1base/error.h" +#include "1base/mutex.h" +#include "1base/scoped_ptr.h" +#include "2config/db_config.h" +#include "2config/env_config.h" +#include "4txn/txn.h" +#include "4env/env_test.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +// A helper structure; ham_env_t is declared in ham/hamsterdb.h as an +// opaque C structure, but internally we use a C++ class. The ham_env_t +// struct satisfies the C compiler, and internally we just cast the pointers. 
struct ham_env_t {
  int dummy;
};

namespace hamsterdb {

class Database;
class Transaction;

//
// The Environment is the "root" of all hamsterdb objects. It's a container
// for multiple databases and transactions.
//
// This class provides exception handling and locking mechanisms, then
// dispatches all calls to LocalEnvironment or RemoteEnvironment.
//
// The public methods are the entry points; each of them forwards to the
// protected pure virtual do_*() method of the same name, which is
// implemented by the derived classes.
//
class Environment
{
  public:
    // Constructor; stores a copy of |config|
    Environment(EnvironmentConfiguration &config)
      : m_config(config) {
    }

    // Virtual destructor; the class is used polymorphically
    virtual ~Environment() {
    }

    // Returns the flags which were set when creating/opening the Environment
    uint32_t get_flags() const {
      return (m_config.flags);
    }

    // Returns the Environment's configuration
    const EnvironmentConfiguration &config() const {
      return (m_config);
    }

    // Returns this Environment's mutex
    Mutex &mutex() {
      return (m_mutex);
    }

    // Creates a new Environment (ham_env_create)
    ham_status_t create();

    // Opens a new Environment (ham_env_open)
    ham_status_t open();

    // Returns all database names (ham_env_get_database_names)
    ham_status_t get_database_names(uint16_t *names, uint32_t *count);

    // Returns environment parameters and flags (ham_env_get_parameters)
    ham_status_t get_parameters(ham_parameter_t *param);

    // Flushes the environment and its databases to disk (ham_env_flush)
    ham_status_t flush(uint32_t flags);

    // Creates a new database in the environment (ham_env_create_db)
    ham_status_t create_db(Database **db, DatabaseConfiguration &config,
                    const ham_parameter_t *param);

    // Opens an existing database in the environment (ham_env_open_db)
    ham_status_t open_db(Database **db, DatabaseConfiguration &config,
                    const ham_parameter_t *param);

    // Renames a database in the Environment (ham_env_rename_db)
    ham_status_t rename_db(uint16_t oldname, uint16_t newname, uint32_t flags);

    // Erases (deletes) a database from the Environment (ham_env_erase_db)
    ham_status_t erase_db(uint16_t name, uint32_t flags);

    // Closes an existing database in the environment (ham_db_close)
    ham_status_t close_db(Database *db, uint32_t flags);

    // Begins a new transaction (ham_txn_begin)
    ham_status_t txn_begin(Transaction **ptxn, const char *name,
                    uint32_t flags);

    // Returns the name of a Transaction
    std::string txn_get_name(Transaction *txn);

    // Commits a transaction (ham_txn_commit)
    ham_status_t txn_commit(Transaction *txn, uint32_t flags);

    // Aborts a transaction (ham_txn_abort)
    ham_status_t txn_abort(Transaction *txn, uint32_t flags);

    // Closes the Environment (ham_env_close)
    ham_status_t close(uint32_t flags);

    // Fills in the current metrics
    ham_status_t fill_metrics(ham_env_metrics_t *metrics);

    // Returns a test object
    EnvironmentTest test();

  protected:
    // Creates a new Environment (ham_env_create)
    virtual ham_status_t do_create() = 0;

    // Opens a new Environment (ham_env_open)
    virtual ham_status_t do_open() = 0;

    // Returns all database names (ham_env_get_database_names)
    virtual ham_status_t do_get_database_names(uint16_t *names,
                    uint32_t *count) = 0;

    // Returns environment parameters and flags (ham_env_get_parameters)
    virtual ham_status_t do_get_parameters(ham_parameter_t *param) = 0;

    // Flushes the environment and its databases to disk (ham_env_flush)
    virtual ham_status_t do_flush(uint32_t flags) = 0;

    // Creates a new database in the environment (ham_env_create_db)
    virtual ham_status_t do_create_db(Database **db,
                    DatabaseConfiguration &config,
                    const ham_parameter_t *param) = 0;

    // Opens an existing database in the environment (ham_env_open_db)
    virtual ham_status_t do_open_db(Database **db,
                    DatabaseConfiguration &config,
                    const ham_parameter_t *param) = 0;

    // Renames a database in the Environment (ham_env_rename_db)
    virtual ham_status_t do_rename_db(uint16_t oldname, uint16_t newname,
                    uint32_t flags) = 0;

    // Erases (deletes) a database from the Environment (ham_env_erase_db)
    virtual ham_status_t do_erase_db(uint16_t name, uint32_t flags) = 0;

    // Begins a new transaction (ham_txn_begin)
    virtual Transaction *do_txn_begin(const char *name, uint32_t flags) = 0;

    // Commits a transaction (ham_txn_commit)
    virtual ham_status_t do_txn_commit(Transaction *txn, uint32_t flags) = 0;

    // Aborts a transaction (ham_txn_abort)
    virtual ham_status_t do_txn_abort(Transaction *txn, uint32_t flags) = 0;

    // Closes the Environment (ham_env_close)
    virtual ham_status_t do_close(uint32_t flags) = 0;

    // Fills in the current metrics
    virtual void do_fill_metrics(ham_env_metrics_t *metrics) const = 0;

  protected:
    // A mutex to serialize access to this Environment
    Mutex m_mutex;

    // The Environment's configuration
    EnvironmentConfiguration m_config;

    // The Transaction manager; can be 0
    ScopedPtr<TransactionManager> m_txn_manager;

    // A map of all opened Databases, indexed by the database name
    typedef std::map<uint16_t, Database *> DatabaseMap;
    DatabaseMap m_database_map;
};

} // namespace hamsterdb

#endif /* HAM_ENV_H */
diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4env/env_header.h b/plugins/Dbx_kv/src/hamsterdb/src/4env/env_header.h
new file mode 100644
index 0000000000..56c5a5fcb8
--- /dev/null
+++ b/plugins/Dbx_kv/src/hamsterdb/src/4env/env_header.h
@@ -0,0 +1,184 @@
/*
 * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de).
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ */ + +/* + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_ENV_HEADER_H +#define HAM_ENV_HEADER_H + +#include "0root/root.h" + +#include <map> +#include <string> + +#include "ham/hamsterdb.h" + +// Always verify that a file of level N does not include headers > N! +#include "1base/error.h" +#include "2page/page.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +#include "1base/packstart.h" + +/** + * the persistent file header + */ +typedef HAM_PACK_0 struct HAM_PACK_1 +{ + /** magic cookie - always "ham\0" */ + uint8_t _magic[4]; + + /** version information - major, minor, rev, file */ + uint8_t _version[4]; + + /** reserved */ + uint64_t _reserved1; + + /** size of the page */ + uint32_t _page_size; + + /** maximum number of databases for this environment */ + uint16_t _max_databases; + + /** PRO: for storing journal compression algorithm */ + uint8_t _journal_compression; + + /** reserved */ + uint8_t _reserved3; + + /** blob id of the PageManager's state */ + uint64_t _page_manager_blobid; + + /* + * following here: + * + * 1. 
the private data of the index btree(s) + * -> see get_btree_header() + */ +} HAM_PACK_2 PEnvironmentHeader; + +#include "1base/packstop.h" + +class EnvironmentHeader +{ + public: + // Constructor + EnvironmentHeader(Page *page) + : m_header_page(page) { + } + + // Sets the 'magic' field of a file header + void set_magic(uint8_t m1, uint8_t m2, uint8_t m3, uint8_t m4) { + get_header()->_magic[0] = m1; + get_header()->_magic[1] = m2; + get_header()->_magic[2] = m3; + get_header()->_magic[3] = m4; + } + + // Returns true if the magic matches + bool verify_magic(uint8_t m1, uint8_t m2, uint8_t m3, uint8_t m4) { + if (get_header()->_magic[0] != m1) + return (false); + if (get_header()->_magic[1] != m2) + return (false); + if (get_header()->_magic[2] != m3) + return (false); + if (get_header()->_magic[3] != m4) + return (false); + return (true); + } + + // Returns byte |i| of the 'version'-header + uint8_t get_version(int i) { + return (get_header()->_version[i]); + } + + // Sets the version of a file header + void set_version(uint8_t major, uint8_t minor, uint8_t revision, + uint8_t file) { + get_header()->_version[0] = major; + get_header()->_version[1] = minor; + get_header()->_version[2] = revision; + get_header()->_version[3] = file; + } + + // Returns get the maximum number of databases for this file + uint16_t get_max_databases() { + return (get_header()->_max_databases); + } + + // Sets the maximum number of databases for this file + void set_max_databases(uint16_t max_databases) { + get_header()->_max_databases = max_databases; + } + + // Returns the page size from the header page + uint32_t page_size() { + return (get_header()->_page_size); + } + + // Sets the page size in the header page + void set_page_size(uint32_t page_size) { + get_header()->_page_size = page_size; + } + + // Returns the PageManager's blob id + uint64_t get_page_manager_blobid() { + return (get_header()->_page_manager_blobid); + } + + // Sets the page size in the header page + void 
set_page_manager_blobid(uint64_t blobid) { + get_header()->_page_manager_blobid = blobid; + } + + // Returns the Journal compression configuration + int get_journal_compression(int *level) { + *level = get_header()->_journal_compression & 0x0f; + return (get_header()->_journal_compression >> 4); + } + + // Sets the Journal compression configuration + void set_journal_compression(int algorithm, int level) { + get_header()->_journal_compression = (algorithm << 4) | level; + } + + // Returns the header page with persistent configuration settings + Page *get_header_page() { + return (m_header_page); + } + + private: + // Returns a pointer to the header data + PEnvironmentHeader *get_header() { + return ((PEnvironmentHeader *)(m_header_page->get_payload())); + } + + // The header page of the Environment + Page *m_header_page; +}; + +} // namespace hamsterdb + +#endif /* HAM_ENV_HEADER_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4env/env_local.cc b/plugins/Dbx_kv/src/hamsterdb/src/4env/env_local.cc new file mode 100644 index 0000000000..7ba0280d7a --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4env/env_local.cc @@ -0,0 +1,760 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! 
+#include "1os/os.h" +#include "2device/device_factory.h" +#include "3btree/btree_index.h" +#include "3btree/btree_stats.h" +#include "3blob_manager/blob_manager_factory.h" +#include "3journal/journal.h" +#include "3page_manager/page_manager.h" +#include "4db/db.h" +#include "4txn/txn.h" +#include "4txn/txn_local.h" +#include "4env/env_local.h" +#include "4cursor/cursor.h" +#include "4context/context.h" +#include "4txn/txn_cursor.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +using namespace hamsterdb; + +namespace hamsterdb { + +LocalEnvironment::LocalEnvironment(EnvironmentConfiguration &config) + : Environment(config) +{ +} + +void +LocalEnvironment::recover(uint32_t flags) +{ + Context context(this); + + ham_status_t st = 0; + m_journal.reset(new Journal(this)); + + ham_assert(get_flags() & HAM_ENABLE_RECOVERY); + + try { + m_journal->open(); + } + catch (Exception &ex) { + if (ex.code == HAM_FILE_NOT_FOUND) { + m_journal->create(); + return; + } + } + + /* success - check if we need recovery */ + if (!m_journal->is_empty()) { + if (flags & HAM_AUTO_RECOVERY) { + m_journal->recover((LocalTransactionManager *)m_txn_manager.get()); + } + else { + st = HAM_NEED_RECOVERY; + goto bail; + } + } + +bail: + /* in case of errors: close log and journal, but do not delete the files */ + if (st) { + m_journal->close(true); + throw Exception(st); + } + + /* reset the page manager */ + m_page_manager->reset(&context); +} + +PBtreeHeader * +LocalEnvironment::btree_header(int i) +{ + PBtreeHeader *d = (PBtreeHeader *) + (m_header->get_header_page()->get_payload() + + sizeof(PEnvironmentHeader)); + return (d + i); +} + +LocalEnvironmentTest +LocalEnvironment::test() +{ + return (LocalEnvironmentTest(this)); +} + +ham_status_t +LocalEnvironment::do_create() +{ + if (m_config.flags & HAM_IN_MEMORY) + m_config.flags |= HAM_DISABLE_RECLAIM_INTERNAL; + + /* initialize the device if it does not yet exist */ + 
m_blob_manager.reset(BlobManagerFactory::create(this, m_config.flags)); + m_device.reset(DeviceFactory::create(m_config)); + if (m_config.flags & HAM_ENABLE_TRANSACTIONS) + m_txn_manager.reset(new LocalTransactionManager(this)); + + /* create the file */ + m_device->create(); + + /* allocate the header page */ + Page *page = new Page(m_device.get()); + page->alloc(Page::kTypeHeader, m_config.page_size_bytes); + ::memset(page->get_data(), 0, m_config.page_size_bytes); + page->set_type(Page::kTypeHeader); + page->set_dirty(true); + + m_header.reset(new EnvironmentHeader(page)); + + /* initialize the header */ + m_header->set_magic('H', 'A', 'M', '\0'); + m_header->set_version(HAM_VERSION_MAJ, HAM_VERSION_MIN, HAM_VERSION_REV, + HAM_FILE_VERSION); + m_header->set_page_size(m_config.page_size_bytes); + m_header->set_max_databases(m_config.max_databases); + + /* load page manager after setting up the blobmanager and the device! */ + m_page_manager.reset(new PageManager(this)); + + /* create a logfile and a journal (if requested) */ + if (get_flags() & HAM_ENABLE_RECOVERY) { + m_journal.reset(new Journal(this)); + m_journal->create(); + } + + /* flush the header page - this will write through disk if logging is + * enabled */ + if (get_flags() & HAM_ENABLE_RECOVERY) + m_header->get_header_page()->flush(); + + return (0); +} + +ham_status_t +LocalEnvironment::do_open() +{ + ham_status_t st = 0; + + Context context(this); + + /* Initialize the device if it does not yet exist. The page size will + * be filled in later (at this point in time, it's still unknown) */ + m_blob_manager.reset(BlobManagerFactory::create(this, m_config.flags)); + m_device.reset(DeviceFactory::create(m_config)); + + /* open the file */ + m_device->open(); + + if (m_config.flags & HAM_ENABLE_TRANSACTIONS) + m_txn_manager.reset(new LocalTransactionManager(this)); + + /* + * read the database header + * + * !!! 
+ * now this is an ugly problem - the database header spans one page, but + * what's the size of this page? chances are good that it's the default + * page-size, but we really can't be sure. + * + * read 512 byte and extract the "real" page size, then read + * the real page. + */ + { + Page *page = 0; + uint8_t hdrbuf[512]; + + /* + * in here, we're going to set up a faked headerpage for the + * duration of this call; BE VERY CAREFUL: we MUST clean up + * at the end of this section or we'll be in BIG trouble! + */ + Page fakepage(m_device.get()); + fakepage.set_data((PPageData *)hdrbuf); + + /* create the configuration object */ + m_header.reset(new EnvironmentHeader(&fakepage)); + + /* + * now fetch the header data we need to get an estimate of what + * the database is made of really. + */ + m_device->read(0, hdrbuf, sizeof(hdrbuf)); + + m_config.page_size_bytes = m_header->page_size(); + + /** check the file magic */ + if (!m_header->verify_magic('H', 'A', 'M', '\0')) { + ham_log(("invalid file type")); + st = HAM_INV_FILE_HEADER; + goto fail_with_fake_cleansing; + } + + /* check the database version; everything with a different file version + * is incompatible */ + if (m_header->get_version(3) != HAM_FILE_VERSION) { + ham_log(("invalid file version")); + st = HAM_INV_FILE_VERSION; + goto fail_with_fake_cleansing; + } + else if (m_header->get_version(0) == 1 && + m_header->get_version(1) == 0 && + m_header->get_version(2) <= 9) { + ham_log(("invalid file version; < 1.0.9 is not supported")); + st = HAM_INV_FILE_VERSION; + goto fail_with_fake_cleansing; + } + + st = 0; + +fail_with_fake_cleansing: + + /* undo the headerpage fake first! 
*/ + fakepage.set_data(0); + m_header.reset(0); + + /* exit when an error was signaled */ + if (st) { + if (m_device->is_open()) + m_device->close(); + return (st); + } + + /* now read the "real" header page and store it in the Environment */ + page = new Page(m_device.get()); + page->fetch(0); + m_header.reset(new EnvironmentHeader(page)); + } + + /* load page manager after setting up the blobmanager and the device! */ + m_page_manager.reset(new PageManager(this)); + + /* check if recovery is required */ + if (get_flags() & HAM_ENABLE_RECOVERY) + recover(m_config.flags); + + /* load the state of the PageManager */ + if (m_header->get_page_manager_blobid() != 0) + m_page_manager->initialize(m_header->get_page_manager_blobid()); + + return (0); +} + +ham_status_t +LocalEnvironment::do_get_database_names(uint16_t *names, uint32_t *count) +{ + uint16_t name; + uint32_t i = 0; + uint32_t max_names = 0; + + max_names = *count; + *count = 0; + + /* copy each database name to the array */ + ham_assert(m_header->get_max_databases() > 0); + for (i = 0; i < m_header->get_max_databases(); i++) { + name = btree_header(i)->get_dbname(); + if (name == 0) + continue; + + if (*count >= max_names) + return (HAM_LIMITS_REACHED); + + names[(*count)++] = name; + } + + return 0; +} + +ham_status_t +LocalEnvironment::do_get_parameters(ham_parameter_t *param) +{ + ham_parameter_t *p = param; + + if (p) { + for (; p->name; p++) { + switch (p->name) { + case HAM_PARAM_CACHE_SIZE: + p->value = m_config.cache_size_bytes; + break; + case HAM_PARAM_PAGE_SIZE: + p->value = m_config.page_size_bytes; + break; + case HAM_PARAM_MAX_DATABASES: + p->value = m_header->get_max_databases(); + break; + case HAM_PARAM_FLAGS: + p->value = get_flags(); + break; + case HAM_PARAM_FILEMODE: + p->value = m_config.file_mode; + break; + case HAM_PARAM_FILENAME: + if (m_config.filename.size()) + p->value = (uint64_t)(PTR_TO_U64(m_config.filename.c_str())); + else + p->value = 0; + break; + case 
HAM_PARAM_LOG_DIRECTORY: + if (m_config.log_filename.size()) + p->value = (uint64_t)(PTR_TO_U64(m_config.log_filename.c_str())); + else + p->value = 0; + break; + case HAM_PARAM_JOURNAL_SWITCH_THRESHOLD: + p->value = m_config.journal_switch_threshold; + break; + case HAM_PARAM_JOURNAL_COMPRESSION: + p->value = 0; + break; + case HAM_PARAM_POSIX_FADVISE: + p->value = m_config.posix_advice; + break; + default: + ham_trace(("unknown parameter %d", (int)p->name)); + return (HAM_INV_PARAMETER); + } + } + } + + return (0); +} + +ham_status_t +LocalEnvironment::do_flush(uint32_t flags) +{ + Context context(this, 0, 0); + + /* flush all committed transactions */ + if (m_txn_manager) + m_txn_manager->flush_committed_txns(&context); + + if (flags & HAM_FLUSH_COMMITTED_TRANSACTIONS || get_flags() & HAM_IN_MEMORY) + return (0); + + /* flush the header page */ + m_header->get_header_page()->flush(); + + /* flush all open pages to disk */ + m_page_manager->flush(false); + + /* flush the device - this usually causes a fsync() */ + m_device->flush(); + return (0); +} + +ham_status_t +LocalEnvironment::do_create_db(Database **pdb, DatabaseConfiguration &config, + const ham_parameter_t *param) +{ + if (get_flags() & HAM_READ_ONLY) { + ham_trace(("cannot create database in a read-only environment")); + return (HAM_WRITE_PROTECTED); + } + + if (param) { + for (; param->name; param++) { + switch (param->name) { + case HAM_PARAM_RECORD_COMPRESSION: + ham_trace(("Record compression is only available in hamsterdb pro")); + return (HAM_NOT_IMPLEMENTED); + case HAM_PARAM_KEY_COMPRESSION: + ham_trace(("Key compression is only available in hamsterdb pro")); + return (HAM_NOT_IMPLEMENTED); + case HAM_PARAM_KEY_TYPE: + config.key_type = (uint16_t)param->value; + break; + case HAM_PARAM_KEY_SIZE: + if (param->value != 0) { + if (param->value > 0xffff) { + ham_trace(("invalid key size %u - must be < 0xffff")); + return (HAM_INV_KEY_SIZE); + } + if (config.flags & HAM_RECORD_NUMBER32) { + if 
(param->value > 0 && param->value != sizeof(uint32_t)) { + ham_trace(("invalid key size %u - must be 4 for " + "HAM_RECORD_NUMBER32 databases", + (unsigned)param->value)); + return (HAM_INV_KEY_SIZE); + } + } + if (config.flags & HAM_RECORD_NUMBER64) { + if (param->value > 0 && param->value != sizeof(uint64_t)) { + ham_trace(("invalid key size %u - must be 8 for " + "HAM_RECORD_NUMBER64 databases", + (unsigned)param->value)); + return (HAM_INV_KEY_SIZE); + } + } + config.key_size = (uint16_t)param->value; + } + break; + case HAM_PARAM_RECORD_SIZE: + config.record_size = (uint32_t)param->value; + break; + default: + ham_trace(("invalid parameter 0x%x (%d)", param->name, param->name)); + return (HAM_INV_PARAMETER); + } + } + } + + if (config.flags & HAM_RECORD_NUMBER32) { + if (config.key_type == HAM_TYPE_UINT8 + || config.key_type == HAM_TYPE_UINT16 + || config.key_type == HAM_TYPE_UINT64 + || config.key_type == HAM_TYPE_REAL32 + || config.key_type == HAM_TYPE_REAL64) { + ham_trace(("HAM_RECORD_NUMBER32 not allowed in combination with " + "fixed length type")); + return (HAM_INV_PARAMETER); + } + config.key_type = HAM_TYPE_UINT32; + } + else if (config.flags & HAM_RECORD_NUMBER64) { + if (config.key_type == HAM_TYPE_UINT8 + || config.key_type == HAM_TYPE_UINT16 + || config.key_type == HAM_TYPE_UINT32 + || config.key_type == HAM_TYPE_REAL32 + || config.key_type == HAM_TYPE_REAL64) { + ham_trace(("HAM_RECORD_NUMBER64 not allowed in combination with " + "fixed length type")); + return (HAM_INV_PARAMETER); + } + config.key_type = HAM_TYPE_UINT64; + } + + uint32_t mask = HAM_FORCE_RECORDS_INLINE + | HAM_FLUSH_WHEN_COMMITTED + | HAM_ENABLE_DUPLICATE_KEYS + | HAM_RECORD_NUMBER32 + | HAM_RECORD_NUMBER64; + if (config.flags & ~mask) { + ham_trace(("invalid flags(s) 0x%x", config.flags & ~mask)); + return (HAM_INV_PARAMETER); + } + + /* create a new Database object */ + LocalDatabase *db = new LocalDatabase(this, config); + + Context context(this, 0, db); + + /* check if this 
database name is unique */ + uint16_t dbi; + for (uint32_t i = 0; i < m_header->get_max_databases(); i++) { + uint16_t name = btree_header(i)->get_dbname(); + if (!name) + continue; + if (name == config.db_name) { + delete db; + return (HAM_DATABASE_ALREADY_EXISTS); + } + } + + /* find a free slot in the PBtreeHeader array and store the name */ + for (dbi = 0; dbi < m_header->get_max_databases(); dbi++) { + uint16_t name = btree_header(dbi)->get_dbname(); + if (!name) { + btree_header(dbi)->set_dbname(config.db_name); + break; + } + } + if (dbi == m_header->get_max_databases()) { + delete db; + return (HAM_LIMITS_REACHED); + } + + mark_header_page_dirty(&context); + + /* initialize the Database */ + ham_status_t st = db->create(&context, btree_header(dbi)); + if (st) { + delete db; + return (st); + } + + /* force-flush the changeset */ + if (get_flags() & HAM_ENABLE_RECOVERY) + context.changeset.flush(next_lsn()); + + *pdb = db; + return (0); +} + +ham_status_t +LocalEnvironment::do_open_db(Database **pdb, DatabaseConfiguration &config, + const ham_parameter_t *param) +{ + *pdb = 0; + + uint32_t mask = HAM_FORCE_RECORDS_INLINE + | HAM_FLUSH_WHEN_COMMITTED + | HAM_READ_ONLY; + if (config.flags & ~mask) { + ham_trace(("invalid flags(s) 0x%x", config.flags & ~mask)); + return (HAM_INV_PARAMETER); + } + + if (param) { + for (; param->name; param++) { + switch (param->name) { + case HAM_PARAM_RECORD_COMPRESSION: + ham_trace(("Record compression is only available in hamsterdb pro")); + return (HAM_NOT_IMPLEMENTED); + case HAM_PARAM_KEY_COMPRESSION: + ham_trace(("Key compression is only available in hamsterdb pro")); + return (HAM_NOT_IMPLEMENTED); + default: + ham_trace(("invalid parameter 0x%x (%d)", param->name, param->name)); + return (HAM_INV_PARAMETER); + } + } + } + + /* create a new Database object */ + LocalDatabase *db = new LocalDatabase(this, config); + + Context context(this, 0, db); + + ham_assert(0 != m_header->get_header_page()); + + /* search for a 
database with this name */ + uint16_t dbi; + for (dbi = 0; dbi < m_header->get_max_databases(); dbi++) { + uint16_t name = btree_header(dbi)->get_dbname(); + if (!name) + continue; + if (config.db_name == name) + break; + } + + if (dbi == m_header->get_max_databases()) { + delete db; + return (HAM_DATABASE_NOT_FOUND); + } + + /* open the database */ + ham_status_t st = db->open(&context, btree_header(dbi)); + if (st) { + delete db; + ham_trace(("Database could not be opened")); + return (st); + } + + *pdb = db; + return (0); +} + +ham_status_t +LocalEnvironment::do_rename_db(uint16_t oldname, uint16_t newname, + uint32_t flags) +{ + Context context(this); + + /* + * check if a database with the new name already exists; also search + * for the database with the old name + */ + uint16_t max = m_header->get_max_databases(); + uint16_t slot = max; + ham_assert(max > 0); + for (uint16_t dbi = 0; dbi < max; dbi++) { + uint16_t name = btree_header(dbi)->get_dbname(); + if (name == newname) + return (HAM_DATABASE_ALREADY_EXISTS); + if (name == oldname) + slot = dbi; + } + + if (slot == max) + return (HAM_DATABASE_NOT_FOUND); + + /* replace the database name with the new name */ + btree_header(slot)->set_dbname(newname); + mark_header_page_dirty(&context); + + /* if the database with the old name is currently open: notify it */ + Environment::DatabaseMap::iterator it = m_database_map.find(oldname); + if (it != m_database_map.end()) { + Database *db = it->second; + it->second->set_name(newname); + m_database_map.erase(oldname); + m_database_map.insert(DatabaseMap::value_type(newname, db)); + } + + return (0); +} + +ham_status_t +LocalEnvironment::do_erase_db(uint16_t name, uint32_t flags) +{ + /* check if this database is still open */ + if (m_database_map.find(name) != m_database_map.end()) + return (HAM_DATABASE_ALREADY_OPEN); + + /* + * if it's an in-memory environment then it's enough to purge the + * database from the environment header + */ + if (get_flags() & 
HAM_IN_MEMORY) { + for (uint16_t dbi = 0; dbi < m_header->get_max_databases(); dbi++) { + PBtreeHeader *desc = btree_header(dbi); + if (name == desc->get_dbname()) { + desc->set_dbname(0); + return (0); + } + } + return (HAM_DATABASE_NOT_FOUND); + } + + /* temporarily load the database */ + LocalDatabase *db; + DatabaseConfiguration config; + config.db_name = name; + ham_status_t st = do_open_db((Database **)&db, config, 0); + if (st) + return (st); + + Context context(this, 0, db); + + /* + * delete all blobs and extended keys, also from the cache and + * the extkey-cache + * + * also delete all pages and move them to the freelist; if they're + * cached, delete them from the cache + */ + st = db->drop(&context); + if (st) + return (st); + + /* now set database name to 0 and set the header page to dirty */ + for (uint16_t dbi = 0; dbi < m_header->get_max_databases(); dbi++) { + PBtreeHeader *desc = btree_header(dbi); + if (name == desc->get_dbname()) { + desc->set_dbname(0); + break; + } + } + + mark_header_page_dirty(&context); + context.changeset.clear(); + + (void)ham_db_close((ham_db_t *)db, HAM_DONT_LOCK); + + return (0); +} + +Transaction * +LocalEnvironment::do_txn_begin(const char *name, uint32_t flags) +{ + Transaction *txn = new LocalTransaction(this, name, flags); + m_txn_manager->begin(txn); + return (txn); +} + +ham_status_t +LocalEnvironment::do_txn_commit(Transaction *txn, uint32_t flags) +{ + return (m_txn_manager->commit(txn, flags)); +} + +ham_status_t +LocalEnvironment::do_txn_abort(Transaction *txn, uint32_t flags) +{ + return (m_txn_manager->abort(txn, flags)); +} + +ham_status_t +LocalEnvironment::do_close(uint32_t flags) +{ + Context context(this); + + /* flush all committed transactions */ + if (m_txn_manager) + m_txn_manager->flush_committed_txns(&context); + + /* flush all pages and the freelist, reduce the file size */ + if (m_page_manager) + m_page_manager->close(&context); + + /* if we're not in read-only mode, and not an 
in-memory-database, + * and the dirty-flag is true: flush the page-header to disk */ + if (m_header && m_header->get_header_page() && !(get_flags() & HAM_IN_MEMORY) + && m_device.get() && m_device.get()->is_open() + && (!(get_flags() & HAM_READ_ONLY))) { + m_header->get_header_page()->flush(); + } + + /* close the header page */ + if (m_header && m_header->get_header_page()) { + Page *page = m_header->get_header_page(); + if (page->get_data()) + m_device->free_page(page); + delete page; + m_header.reset(); + } + + /* close the device */ + if (m_device) { + if (m_device->is_open()) { + if (!(get_flags() & HAM_READ_ONLY)) + m_device->flush(); + m_device->close(); + } + } + + /* close the log and the journal */ + if (m_journal) + m_journal->close(!!(flags & HAM_DONT_CLEAR_LOG)); + + return (0); +} + +void +LocalEnvironment::do_fill_metrics(ham_env_metrics_t *metrics) const +{ + // PageManager metrics (incl. cache and freelist) + m_page_manager->fill_metrics(metrics); + // the BlobManagers + m_blob_manager->fill_metrics(metrics); + // the Journal (if available) + if (m_journal) + m_journal->fill_metrics(metrics); + // the (first) database + if (!m_database_map.empty()) { + LocalDatabase *db = (LocalDatabase *)m_database_map.begin()->second; + db->fill_metrics(metrics); + } + // and of the btrees + BtreeIndex::fill_metrics(metrics); + // SIMD support enabled? + metrics->simd_lane_width = os_get_simd_lane_width(); +} + +void +LocalEnvironmentTest::set_journal(Journal *journal) +{ + m_env->m_journal.reset(journal); +} + +} // namespace hamsterdb diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4env/env_local.h b/plugins/Dbx_kv/src/hamsterdb/src/4env/env_local.h new file mode 100644 index 0000000000..7800ee37de --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4env/env_local.h @@ -0,0 +1,192 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). 
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * @exception_safe: unknown
 * @thread_safe: unknown
 */

#ifndef HAM_ENV_LOCAL_H
#define HAM_ENV_LOCAL_H

#include "ham/hamsterdb.h"

#include "0root/root.h"

// Always verify that a file of level N does not include headers > N!
#include "1base/scoped_ptr.h"
#include "2lsn_manager/lsn_manager.h"
#include "3journal/journal.h"
#include "4env/env.h"
#include "4env/env_header.h"
#include "4env/env_local_test.h"
#include "4context/context.h"

#ifndef HAM_ROOT_H
#  error "root.h was not included"
#endif

namespace hamsterdb {

class PBtreeHeader;
class PFreelistPayload;
class Journal;
class PageManager;
class BlobManager;
class LocalTransaction;
struct MessageBase;

//
// The Environment implementation for local file access
//
class LocalEnvironment : public Environment
{
  public:
    LocalEnvironment(EnvironmentConfiguration &config);

    // Returns the Device object
    Device *device() {
      return (m_device.get());
    }

    // Returns the Environment's header object with the persistent configuration
    EnvironmentHeader *header() {
      return (m_header.get());
    }

    // Returns the blob manager
    BlobManager *blob_manager() {
      return (m_blob_manager.get());
    }

    // Returns the PageManager instance
    PageManager *page_manager() {
      return (m_page_manager.get());
    }

    // Returns the Journal
    Journal *journal() {
      return (m_journal.get());
    }

    // Returns the lsn manager
    LsnManager *lsn_manager() {
      return (&m_lsn_manager);
    }

    // The transaction manager
    TransactionManager *txn_manager() {
      return (m_txn_manager.get());
    }

    // Increments the lsn and returns the incremented value
    uint64_t next_lsn() {
      return (m_lsn_manager.next());
    }

    // Returns a test gateway
    LocalEnvironmentTest test();

  protected:
    // Creates a new Environment (ham_env_create)
    virtual ham_status_t do_create();

    // Opens a new Environment (ham_env_open)
    virtual ham_status_t do_open();

    // Returns all database names (ham_env_get_database_names)
    virtual ham_status_t do_get_database_names(uint16_t *names,
                    uint32_t *count);

    // Returns environment parameters and flags (ham_env_get_parameters)
    virtual ham_status_t do_get_parameters(ham_parameter_t *param);

    // Flushes the environment and its databases to disk (ham_env_flush)
    virtual ham_status_t do_flush(uint32_t flags);

    // Creates a new database in the environment (ham_env_create_db)
    virtual ham_status_t do_create_db(Database **db,
                    DatabaseConfiguration &config,
                    const ham_parameter_t *param);

    // Opens an existing database in the environment (ham_env_open_db)
    virtual ham_status_t do_open_db(Database **db,
                    DatabaseConfiguration &config,
                    const ham_parameter_t *param);

    // Renames a database in the Environment (ham_env_rename_db)
    virtual ham_status_t do_rename_db(uint16_t oldname, uint16_t newname,
                    uint32_t flags);

    // Erases (deletes) a database from the Environment (ham_env_erase_db)
    virtual ham_status_t do_erase_db(uint16_t name, uint32_t flags);

    // Begins a new transaction (ham_txn_begin)
    virtual Transaction *do_txn_begin(const char *name, uint32_t flags);

    // Commits a transaction (ham_txn_commit)
    virtual ham_status_t do_txn_commit(Transaction *txn, uint32_t flags);

    // Aborts a transaction (ham_txn_abort)
    virtual ham_status_t do_txn_abort(Transaction *txn, uint32_t flags);

    // Closes the Environment (ham_env_close)
    virtual ham_status_t do_close(uint32_t flags);

    // Fills in the current metrics
    virtual void do_fill_metrics(ham_env_metrics_t *metrics) const;

  private:
    // Gives the test gateway access to the private members
    friend class LocalEnvironmentTest;

    // Runs the recovery process
    void recover(uint32_t flags);

    // Get the btree configuration of the database #i, where |i| is a
    // zero-based index
    PBtreeHeader *btree_header(int i);

    // Sets the dirty-flag of the header page and adds the header page
    // to the Changeset (if recovery is enabled)
    void mark_header_page_dirty(Context *context) {
      Page *page = m_header->get_header_page();
      page->set_dirty(true);
      if (get_flags() & HAM_ENABLE_RECOVERY)
        context->changeset.put(page);
    }

    // The Environment's header page/configuration
    ScopedPtr<EnvironmentHeader> m_header;

    // The device instance (either a file or an in-memory-db)
    ScopedPtr<Device> m_device;

    // The BlobManager instance
    ScopedPtr<BlobManager> m_blob_manager;

    // The PageManager instance
    ScopedPtr<PageManager> m_page_manager;

    // The logical journal
    ScopedPtr<Journal> m_journal;

    // The lsn manager
    LsnManager m_lsn_manager;
};

} // namespace hamsterdb

#endif /* HAM_ENV_LOCAL_H */
// diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4env/env_local_test.h b/plugins/Dbx_kv/src/hamsterdb/src/4env/env_local_test.h
// new file mode 100644
// index 0000000000..ea045e18dc
// --- /dev/null
// +++ b/plugins/Dbx_kv/src/hamsterdb/src/4env/env_local_test.h
// @@ -0,0 +1,56 @@
/*
 * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de).
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @exception_safe: no + * @thread_safe: no + */ + +#ifndef HAM_ENV_LOCAL_TEST_H +#define HAM_ENV_LOCAL_TEST_H + +#include "ham/hamsterdb.h" + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class Journal; +class LocalEnvironment; + +class LocalEnvironmentTest +{ + public: + LocalEnvironmentTest(LocalEnvironment *env) + : m_env(env) { + } + + // Sets a new journal object + void set_journal(Journal *journal); + + private: + LocalEnvironment *m_env; +}; + +} // namespace hamsterdb + +#endif /* HAM_ENV_LOCAL_TEST_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4env/env_remote.cc b/plugins/Dbx_kv/src/hamsterdb/src/4env/env_remote.cc new file mode 100644 index 0000000000..6e53543c8b --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4env/env_remote.cc @@ -0,0 +1,445 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef HAM_ENABLE_REMOTE + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "1os/os.h" +#include "1base/scoped_ptr.h" +#include "2protobuf/protocol.h" +#include "4cursor/cursor.h" +#include "4db/db_remote.h" +#include "4env/env_remote.h" +#include "4txn/txn_remote.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +RemoteEnvironment::RemoteEnvironment(EnvironmentConfiguration config) + : Environment(config), m_remote_handle(0), m_buffer(1024 * 4) +{ +} + +Protocol * +RemoteEnvironment::perform_request(Protocol *request) +{ + // use ByteArray to avoid frequent reallocs! + m_buffer.clear(); + + if (!request->pack(&m_buffer)) { + ham_log(("protoype Protocol::pack failed")); + throw Exception(HAM_INTERNAL_ERROR); + } + + m_socket.send((uint8_t *)m_buffer.get_ptr(), m_buffer.get_size()); + + // now block and wait for the reply; first read the header, then the + // remaining data + m_socket.recv((uint8_t *)m_buffer.get_ptr(), 8); + + // no need to check the magic; it's verified in Protocol::unpack + uint32_t size = *(uint32_t *)((char *)m_buffer.get_ptr() + 4); + m_buffer.resize(size + 8); + m_socket.recv((uint8_t *)m_buffer.get_ptr() + 8, size); + + return (Protocol::unpack((const uint8_t *)m_buffer.get_ptr(), size + 8)); +} + +void +RemoteEnvironment::perform_request(SerializedWrapper *request, + SerializedWrapper *reply) +{ + int size_left = (int)request->get_size(); + request->size = size_left; + request->magic = HAM_TRANSFER_MAGIC_V2; + m_buffer.resize(request->size); + + uint8_t *ptr = (uint8_t *)m_buffer.get_ptr(); + request->serialize(&ptr, &size_left); + ham_assert(size_left == 0); + + m_socket.send((uint8_t *)m_buffer.get_ptr(), request->size); + + // now block and wait for the reply; first read the header, then the + // remaining data + m_socket.recv((uint8_t *)m_buffer.get_ptr(), 8); + + // now check the magic and receive the remaining data + 
uint32_t magic = *(uint32_t *)((char *)m_buffer.get_ptr() + 0); + if (magic != HAM_TRANSFER_MAGIC_V2) + throw Exception(HAM_INTERNAL_ERROR); + // TODO check the magic + int size = (int)*(uint32_t *)((char *)m_buffer.get_ptr() + 4); + m_buffer.resize(size); + m_socket.recv((uint8_t *)m_buffer.get_ptr() + 8, size - 8); + + ptr = (uint8_t *)m_buffer.get_ptr(); + reply->deserialize(&ptr, &size); + ham_assert(size == 0); +} + +ham_status_t +RemoteEnvironment::do_create() +{ + // the 'create' operation is identical to 'open' + return (do_open()); +} + +ham_status_t +RemoteEnvironment::do_open() +{ + m_socket.close(); + + const char *url = m_config.filename.c_str(); + + ham_assert(url != 0); + ham_assert(::strstr(url, "ham://") == url); + const char *ip = url + 6; + const char *port_str = strstr(ip, ":"); + if (!port_str) { + ham_trace(("remote uri does not include port - expected " + "`ham://<ip>:<port>`")); + return (HAM_INV_PARAMETER); + } + uint16_t port = (uint16_t)atoi(port_str + 1); + if (!port) { + ham_trace(("remote uri includes invalid port - expected " + "`ham://<ip>:<port>`")); + return (HAM_INV_PARAMETER); + } + + const char *filename = strstr(port_str, "/"); + + std::string hostname(ip, port_str); + m_socket.connect(hostname.c_str(), port, m_config.remote_timeout_sec); + + Protocol request(Protocol::CONNECT_REQUEST); + request.mutable_connect_request()->set_path(filename); + + ScopedPtr<Protocol> reply(perform_request(&request)); + + ham_assert(reply->type() == Protocol::CONNECT_REPLY); + + ham_status_t st = reply->connect_reply().status(); + if (st == 0) { + m_config.flags |= reply->connect_reply().env_flags(); + m_remote_handle = reply->connect_reply().env_handle(); + + if (get_flags() & HAM_ENABLE_TRANSACTIONS) + m_txn_manager.reset(new RemoteTransactionManager(this)); + } + + return (st); +} + +ham_status_t +RemoteEnvironment::do_get_database_names(uint16_t *names, uint32_t *count) +{ + Protocol request(Protocol::ENV_GET_DATABASE_NAMES_REQUEST); + 
request.mutable_env_get_database_names_request(); + request.mutable_env_get_database_names_request()->set_env_handle(m_remote_handle); + + ScopedPtr<Protocol> reply(perform_request(&request)); + + ham_assert(reply->has_env_get_database_names_reply()); + + ham_status_t st = reply->env_get_database_names_reply().status(); + if (st) + return (st); + + /* copy the retrieved names */ + uint32_t i; + for (i = 0; + i < (uint32_t)reply->env_get_database_names_reply().names_size() + && i < *count; + i++) { + names[i] = (uint16_t)*(reply->mutable_env_get_database_names_reply()->mutable_names()->mutable_data() + i); + } + + *count = i; + return (0); +} + +ham_status_t +RemoteEnvironment::do_get_parameters(ham_parameter_t *param) +{ + static char filename[1024]; // TODO not threadsafe!! + ham_parameter_t *p = param; + + Protocol request(Protocol::ENV_GET_PARAMETERS_REQUEST); + request.mutable_env_get_parameters_request()->set_env_handle(m_remote_handle); + while (p && p->name != 0) { + request.mutable_env_get_parameters_request()->add_names(p->name); + p++; + } + + ScopedPtr<Protocol> reply(perform_request(&request)); + + ham_assert(reply->has_env_get_parameters_reply()); + + ham_status_t st = reply->env_get_parameters_reply().status(); + if (st) + return (st); + + p = param; + while (p && p->name) { + switch (p->name) { + case HAM_PARAM_CACHESIZE: + ham_assert(reply->env_get_parameters_reply().has_cache_size()); + p->value = reply->env_get_parameters_reply().cache_size(); + break; + case HAM_PARAM_PAGESIZE: + ham_assert(reply->env_get_parameters_reply().has_page_size()); + p->value = reply->env_get_parameters_reply().page_size(); + break; + case HAM_PARAM_MAX_DATABASES: + ham_assert(reply->env_get_parameters_reply().has_max_env_databases()); + p->value = reply->env_get_parameters_reply().max_env_databases(); + break; + case HAM_PARAM_FLAGS: + ham_assert(reply->env_get_parameters_reply().has_flags()); + p->value = reply->env_get_parameters_reply().flags(); + break; + case 
HAM_PARAM_FILEMODE: + ham_assert(reply->env_get_parameters_reply().has_filemode()); + p->value = reply->env_get_parameters_reply().filemode(); + break; + case HAM_PARAM_FILENAME: + if (reply->env_get_parameters_reply().has_filename()) { + strncpy(filename, reply->env_get_parameters_reply().filename().c_str(), + sizeof(filename) - 1); + filename[sizeof(filename) - 1] = 0; + p->value = (uint64_t)(&filename[0]); + } + break; + default: + ham_trace(("unknown parameter %d", (int)p->name)); + break; + } + p++; + } + return (0); +} + +ham_status_t +RemoteEnvironment::do_flush(uint32_t flags) +{ + Protocol request(Protocol::ENV_FLUSH_REQUEST); + request.mutable_env_flush_request()->set_flags(flags); + request.mutable_env_flush_request()->set_env_handle(m_remote_handle); + + ScopedPtr<Protocol> reply(perform_request(&request)); + + ham_assert(reply->has_env_flush_reply()); + + return (reply->env_flush_reply().status()); +} + +ham_status_t +RemoteEnvironment::do_create_db(Database **pdb, DatabaseConfiguration &config, + const ham_parameter_t *param) +{ + Protocol request(Protocol::ENV_CREATE_DB_REQUEST); + request.mutable_env_create_db_request()->set_env_handle(m_remote_handle); + request.mutable_env_create_db_request()->set_dbname(config.db_name); + request.mutable_env_create_db_request()->set_flags(config.flags); + + const ham_parameter_t *p = param; + if (p) { + for (; p->name; p++) { + request.mutable_env_create_db_request()->add_param_names(p->name); + request.mutable_env_create_db_request()->add_param_values(p->value); + } + } + + ScopedPtr<Protocol> reply(perform_request(&request)); + + ham_assert(reply->has_env_create_db_reply()); + + ham_status_t st = reply->env_create_db_reply().status(); + if (st) + return (st); + + config.flags = reply->env_create_db_reply().db_flags(); + RemoteDatabase *rdb = new RemoteDatabase(this, config, + reply->env_create_db_reply().db_handle()); + + *pdb = rdb; + return (0); +} + +ham_status_t +RemoteEnvironment::do_open_db(Database 
**pdb, DatabaseConfiguration &config, + const ham_parameter_t *param) +{ + Protocol request(Protocol::ENV_OPEN_DB_REQUEST); + request.mutable_env_open_db_request()->set_env_handle(m_remote_handle); + request.mutable_env_open_db_request()->set_dbname(config.db_name); + request.mutable_env_open_db_request()->set_flags(config.flags); + + const ham_parameter_t *p = param; + if (p) { + for (; p->name; p++) { + request.mutable_env_open_db_request()->add_param_names(p->name); + request.mutable_env_open_db_request()->add_param_values(p->value); + } + } + + ScopedPtr<Protocol> reply(perform_request(&request)); + + ham_assert(reply->has_env_open_db_reply()); + + ham_status_t st = reply->env_open_db_reply().status(); + if (st) + return (st); + + config.flags = reply->env_open_db_reply().db_flags(); + RemoteDatabase *rdb = new RemoteDatabase(this, config, + reply->env_open_db_reply().db_handle()); + + *pdb = rdb; + return (0); +} + +ham_status_t +RemoteEnvironment::do_rename_db( uint16_t oldname, uint16_t newname, + uint32_t flags) +{ + Protocol request(Protocol::ENV_RENAME_REQUEST); + request.mutable_env_rename_request()->set_env_handle(m_remote_handle); + request.mutable_env_rename_request()->set_oldname(oldname); + request.mutable_env_rename_request()->set_newname(newname); + request.mutable_env_rename_request()->set_flags(flags); + + ScopedPtr<Protocol> reply(perform_request(&request)); + + ham_assert(reply->has_env_rename_reply()); + + return (reply->env_rename_reply().status()); +} + +ham_status_t +RemoteEnvironment::do_erase_db(uint16_t name, uint32_t flags) +{ + Protocol request(Protocol::ENV_ERASE_DB_REQUEST); + request.mutable_env_erase_db_request()->set_env_handle(m_remote_handle); + request.mutable_env_erase_db_request()->set_name(name); + request.mutable_env_erase_db_request()->set_flags(flags); + + ScopedPtr<Protocol> reply(perform_request(&request)); + + ham_assert(reply->has_env_erase_db_reply()); + + return (reply->env_erase_db_reply().status()); +} + 
+Transaction * +RemoteEnvironment::do_txn_begin(const char *name, uint32_t flags) +{ + SerializedWrapper request; + request.id = kTxnBeginRequest; + request.txn_begin_request.env_handle = m_remote_handle; + request.txn_begin_request.flags = flags; + if (name) { + request.txn_begin_request.name.value = (uint8_t *)name; + request.txn_begin_request.name.size = strlen(name) + 1; + } + + SerializedWrapper reply; + perform_request(&request, &reply); + ham_assert(reply.id == kTxnBeginReply); + + ham_status_t st = reply.txn_begin_reply.status; + if (st) + throw Exception(st); + + Transaction *txn = new RemoteTransaction(this, name, flags, + reply.txn_begin_reply.txn_handle); + m_txn_manager->begin(txn); + return (txn); +} + +ham_status_t +RemoteEnvironment::do_txn_commit(Transaction *txn, uint32_t flags) +{ + RemoteTransaction *rtxn = dynamic_cast<RemoteTransaction *>(txn); + + SerializedWrapper request; + request.id = kTxnCommitRequest; + request.txn_commit_request.txn_handle = rtxn->get_remote_handle(); + request.txn_commit_request.flags = flags; + + SerializedWrapper reply; + perform_request(&request, &reply); + ham_assert(reply.id == kTxnCommitReply); + + ham_status_t st = reply.txn_commit_reply.status; + if (st) + return (st); + + return (m_txn_manager->commit(txn, flags)); +} + +ham_status_t +RemoteEnvironment::do_txn_abort(Transaction *txn, uint32_t flags) +{ + RemoteTransaction *rtxn = dynamic_cast<RemoteTransaction *>(txn); + + SerializedWrapper request; + request.id = kTxnAbortRequest; + request.txn_abort_request.txn_handle = rtxn->get_remote_handle(); + request.txn_abort_request.flags = flags; + + SerializedWrapper reply; + perform_request(&request, &reply); + ham_assert(reply.id == kTxnAbortReply); + ham_status_t st = reply.txn_abort_reply.status; + if (st) + return (st); + + return (m_txn_manager->abort(txn, flags)); +} + +ham_status_t +RemoteEnvironment::do_close(uint32_t flags) +{ + Protocol request(Protocol::DISCONNECT_REQUEST); + 
request.mutable_disconnect_request()->set_env_handle(m_remote_handle); + + ScopedPtr<Protocol> reply(perform_request(&request)); + + // ignore the reply + + m_socket.close(); + m_remote_handle = 0; + return (0); +} + +void +RemoteEnvironment::do_fill_metrics(ham_env_metrics_t *metrics) const +{ + throw Exception(HAM_NOT_IMPLEMENTED); +} + +} // namespace hamsterdb + +#endif // HAM_ENABLE_REMOTE + diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4env/env_remote.h b/plugins/Dbx_kv/src/hamsterdb/src/4env/env_remote.h new file mode 100644 index 0000000000..c45fd5b222 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4env/env_remote.h @@ -0,0 +1,125 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_ENV_REMOTE_H +#define HAM_ENV_REMOTE_H + +#ifdef HAM_ENABLE_REMOTE + +#include "0root/root.h" + +#include "ham/hamsterdb.h" + +// Always verify that a file of level N does not include headers > N! 
+#include "1os/socket.h" +#include "1base/dynamic_array.h" +#include "2protobuf/protocol.h" +#include "2protoserde/messages.h" +#include "4env/env.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +// +// The Environment implementation for remote file access +// +class RemoteEnvironment : public Environment +{ + public: + // Constructor + RemoteEnvironment(EnvironmentConfiguration config); + + // Sends a |request| message with the Google Protocol Buffers API. Blocks + // till the reply was fully received. Returns the reply structure. + Protocol *perform_request(Protocol *request); + + // Sends |request| message with the builtin Serde API. Blocks till the + // reply was fully received. Fills |reply| with the received data. + void perform_request(SerializedWrapper *request, SerializedWrapper *reply); + + protected: + // Creates a new Environment (ham_env_create) + virtual ham_status_t do_create(); + + // Opens a new Environment (ham_env_open) + virtual ham_status_t do_open(); + + // Returns all database names (ham_env_get_database_names) + virtual ham_status_t do_get_database_names(uint16_t *names, + uint32_t *count); + + // Returns environment parameters and flags (ham_env_get_parameters) + virtual ham_status_t do_get_parameters(ham_parameter_t *param); + + // Flushes the environment and its databases to disk (ham_env_flush) + virtual ham_status_t do_flush(uint32_t flags); + + // Creates a new database in the environment (ham_env_create_db) + virtual ham_status_t do_create_db(Database **db, + DatabaseConfiguration &config, + const ham_parameter_t *param); + + // Opens an existing database in the environment (ham_env_open_db) + virtual ham_status_t do_open_db(Database **db, + DatabaseConfiguration &config, + const ham_parameter_t *param); + + // Renames a database in the Environment (ham_env_rename_db) + virtual ham_status_t do_rename_db(uint16_t oldname, uint16_t newname, + uint32_t flags); + + // Erases (deletes) a 
database from the Environment (ham_env_erase_db) + virtual ham_status_t do_erase_db(uint16_t name, uint32_t flags); + + // Begins a new transaction (ham_txn_begin) + virtual Transaction *do_txn_begin(const char *name, uint32_t flags); + + // Commits a transaction (ham_txn_commit) + virtual ham_status_t do_txn_commit(Transaction *txn, uint32_t flags); + + // Commits a transaction (ham_txn_abort) + virtual ham_status_t do_txn_abort(Transaction *txn, uint32_t flags); + + // Closes the Environment (ham_env_close) + virtual ham_status_t do_close(uint32_t flags); + + // Fills in the current metrics + virtual void do_fill_metrics(ham_env_metrics_t *metrics) const; + + private: + // the remote handle + uint64_t m_remote_handle; + + // the socket + Socket m_socket; + + // a buffer to avoid frequent memory allocations + ByteArray m_buffer; +}; + +} // namespace hamsterdb + +#endif // HAM_ENABLE_REMOTE + +#endif /* HAM_ENV_REMOTE_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4env/env_test.h b/plugins/Dbx_kv/src/hamsterdb/src/4env/env_test.h new file mode 100644 index 0000000000..0d9fa76cec --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4env/env_test.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * @exception_safe: no + * @thread_safe: no + */ + +#ifndef HAM_ENV_TEST_H +#define HAM_ENV_TEST_H + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "4env/env.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class EnvironmentTest +{ + public: + // Constructor + EnvironmentTest(EnvironmentConfiguration &config) + : m_config(config) { + } + + // Returns the Environment's configuration + EnvironmentConfiguration &config() { + return (m_config); + } + + void set_filename(const std::string &filename) { + m_config.filename = filename; + } + + private: + // Reference to the Environment's configuration + EnvironmentConfiguration &m_config; +}; + +} // namespace hamsterdb + +#endif /* HAM_ENV_TEST_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn.h b/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn.h new file mode 100644 index 0000000000..e38e6155dc --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn.h @@ -0,0 +1,298 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * The hamsterdb Transaction implementation + * + * hamsterdb stores Transactions in volatile RAM (with an append-only journal + * in case the RAM is lost). Each Transaction and each modification *in* a + * Transaction is stored in a complex data structure. 
+ * + * When a Database is created, it contains a BtreeIndex for persistent + * (committed and flushed) data, and a TransactionIndex for active Transactions + * and those Transactions which were committed but not yet flushed to disk. + * This TransactionTree is implemented as a binary search tree (see rb.h). + * + * Each node in the TransactionTree is implemented by TransactionNode. Each + * node is identified by its database key, and groups all modifications of this + * key (of all Transactions!). + * + * Each modification in the node is implemented by TransactionOperation. There + * is one such TransactionOperation for 'insert', 'erase' etc. The + * TransactionOperations form two linked lists - one stored in the Transaction + * ("all operations from this Transaction") and another one stored in the + * TransactionNode ("all operations on the same key"). + * + * All Transactions in an Environment for a linked list, where the tail is + * the chronologically newest Transaction and the head is the oldest + * (see Transaction::get_newer and Transaction::get_older). + * + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_TXN_H +#define HAM_TXN_H + +#include "0root/root.h" + +#include <string> + +// Always verify that a file of level N does not include headers > N! +#include "1base/dynamic_array.h" +#include "1base/error.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +// +// A helper structure; ham_txn_t is declared in ham/hamsterdb.h as an +// opaque C structure, but internally we use a C++ class. The ham_txn_t +// struct satisfies the C compiler, and internally we just cast the pointers. +// +struct ham_txn_t +{ + int dummy; +}; + +namespace hamsterdb { + +struct Context; +class Environment; + +// +// An abstract base class for a Transaction. 
Overwritten for local and +// remote implementations +// +class Transaction +{ + protected: + enum { + // Transaction was aborted + kStateAborted = 0x10000, + + // Transaction was committed + kStateCommitted = 0x20000 + }; + + public: + // Constructor; "begins" the Transaction + // supported flags: HAM_TXN_READ_ONLY, HAM_TXN_TEMPORARY + Transaction(Environment *env, const char *name, uint32_t flags) + : m_id(0), m_env(env), m_flags(flags), m_next(0), m_cursor_refcount(0) { + if (name) + m_name = name; + } + + // Destructor + virtual ~Transaction() { } + + // Commits the Transaction + virtual void commit(uint32_t flags = 0) = 0; + + // Aborts the Transaction + virtual void abort(uint32_t flags = 0) = 0; + + // Returns true if the Transaction was aborted + bool is_aborted() const { + return (m_flags & kStateAborted) != 0; + } + + // Returns true if the Transaction was committed + bool is_committed() const { + return (m_flags & kStateCommitted) != 0; + } + + // Returns the unique id of this Transaction + uint64_t get_id() const { + return (m_id); + } + + // Returns the environment pointer + Environment *get_env() const { + return (m_env); + } + + // Returns the txn name + const std::string &get_name() const { + return (m_name); + } + + // Returns the flags + uint32_t get_flags() const { + return (m_flags); + } + + // Returns the cursor refcount (numbers of Cursors using this Transaction) + uint32_t get_cursor_refcount() const { + return (m_cursor_refcount); + } + + // Increases the cursor refcount (numbers of Cursors using this Transaction) + void increase_cursor_refcount() { + m_cursor_refcount++; + } + + // Decreases the cursor refcount (numbers of Cursors using this Transaction) + void decrease_cursor_refcount() { + ham_assert(m_cursor_refcount > 0); + m_cursor_refcount--; + } + + // Returns the memory buffer for the key data. + // Used to allocate array in ham_find, ham_cursor_move etc. which is + // then returned to the user. 
+ ByteArray &key_arena() { + return (m_key_arena); + } + + // Returns the memory buffer for the record data. + // Used to allocate array in ham_find, ham_cursor_move etc. which is + // then returned to the user. + ByteArray &record_arena() { + return (m_record_arena); + } + + // Returns the next Transaction in the linked list */ + Transaction *get_next() const { + return (m_next); + } + + // Sets the next Transaction in the linked list */ + void set_next(Transaction *n) { + m_next = n; + } + + protected: + // the id of this Transaction + uint64_t m_id; + + // the Environment pointer + Environment *m_env; + + // flags for this Transaction + uint32_t m_flags; + + // the Transaction name + std::string m_name; + + // the linked list of all transactions + Transaction *m_next; + + // reference counter for cursors (number of cursors attached to this txn) + uint32_t m_cursor_refcount; + + // this is where key->data points to when returning a key to the user + ByteArray m_key_arena; + + // this is where record->data points to when returning a record to the user + ByteArray m_record_arena; + + private: + friend class Journal; + + // Sets the unique id of this Transaction; the journal needs this to patch + // in the id when recovering a Transaction + void set_id(uint64_t id) { + m_id = id; + } +}; + + +// +// An abstract base class for the TransactionManager. Overwritten for local and +// remote implementations. +// +// The TransactionManager is part of the Environment and manages all +// Transactions. 
+// +class TransactionManager +{ + public: + // Constructor + TransactionManager(Environment *env) + : m_env(env), m_oldest_txn(0), m_newest_txn(0) { + } + + // Destructor + virtual ~TransactionManager() { } + + // Begins a new Transaction + virtual void begin(Transaction *txn) = 0; + + // Commits a Transaction; the derived subclass has to take care of + // flushing and/or releasing memory + virtual ham_status_t commit(Transaction *txn, uint32_t flags = 0) = 0; + + // Aborts a Transaction; the derived subclass has to take care of + // flushing and/or releasing memory + virtual ham_status_t abort(Transaction *txn, uint32_t flags = 0) = 0; + + // Flushes committed (queued) transactions + virtual void flush_committed_txns(Context *context = 0) = 0; + + // Returns the oldest transaction which not yet flushed to disk + Transaction *get_oldest_txn() { + return (m_oldest_txn); + } + + // Returns the newest transaction which not yet flushed to disk + Transaction *get_newest_txn() { + return (m_newest_txn); + } + + protected: + // Adds a new transaction to this Environment + void append_txn_at_tail(Transaction *txn) { + if (!m_newest_txn) { + ham_assert(m_oldest_txn == 0); + m_oldest_txn = txn; + m_newest_txn = txn; + } + else { + m_newest_txn->set_next(txn); + m_newest_txn = txn; + /* if there's no oldest txn (this means: all txn's but the + * current one were already flushed) then set this txn as + * the oldest txn */ + if (!m_oldest_txn) + m_oldest_txn = txn; + } + } + + // Removes a transaction from this Environment + void remove_txn_from_head(Transaction *txn) { + if (m_newest_txn == txn) + m_newest_txn = 0; + + ham_assert(m_oldest_txn == txn); + m_oldest_txn = txn->get_next(); + } + + // The Environment which created this TransactionManager + Environment *m_env; + + // The head of the transaction list (the oldest transaction) + Transaction *m_oldest_txn; + + // The tail of the transaction list (the youngest/newest transaction) + Transaction *m_newest_txn; +}; + +} // 
// namespace hamsterdb

#endif /* HAM_TXN_H */
diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_cursor.cc b/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_cursor.cc
new file mode 100644
index 0000000000..b91469239f
--- /dev/null
+++ b/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_cursor.cc
@@ -0,0 +1,368 @@
/*
 * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de).
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "0root/root.h"

// Always verify that a file of level N does not include headers > N!
#include "3btree/btree_cursor.h"
#include "4db/db.h"
#include "4txn/txn.h"
#include "4txn/txn_cursor.h"
#include "4txn/txn_local.h"
#include "4env/env.h"
#include "4cursor/cursor.h"
#include "4context/context.h"

#ifndef HAM_ROOT_H
#  error "root.h was not included"
#endif

namespace hamsterdb {

// Makes this cursor point to the same operation as |other|.
//
// The coupling pointers are reset WITHOUT unlinking this cursor from any
// operation; only afterwards the cursor is coupled to the operation of
// |other| (if |other| is not nil).
// NOTE(review): presumably this is intentional because clone() is called
// on a fresh (or bitwise copied) cursor which is not a member of any
// cursor list - confirm against the callers before "fixing" it.
void
TransactionCursor::clone(const TransactionCursor *other)
{
  m_coupled_op = 0;
  m_coupled_next = 0;
  m_coupled_previous = 0;

  if (!other->is_nil())
    couple_to_op(other->get_coupled_op());
}

// Sets the cursor to nil: if it is coupled then it is removed from the
// cursor list of its operation and |m_coupled_op| is cleared
void
TransactionCursor::set_to_nil()
{
  /* coupled cursor? then remove it from the cursor list of the txn_op
   * structure */
  if (!is_nil()) {
    TransactionOperation *op = get_coupled_op();
    if (op)
      remove_cursor_from_op(op);
    m_coupled_op = 0;
  }

  /* otherwise cursor is already nil */
}

// Couples this cursor to |op|; an existing coupling is released first.
// The cursor is inserted at the head of the op's double linked list of
// coupled cursors.
void
TransactionCursor::couple_to_op(TransactionOperation *op)
{
  set_to_nil();
  m_coupled_op = op;

  /* the previous list head (if any) becomes our successor */
  m_coupled_next = op->cursor_list();
  m_coupled_previous = 0;

  if (op->cursor_list()) {
    TransactionCursor *old = op->cursor_list();
    old->m_coupled_previous = this;
  }

  op->set_cursor_list(this);
}

// Overwrites the record of the key to which this cursor is coupled.
// Returns HAM_CURSOR_IS_NIL if the cursor is nil, otherwise the status of
// LocalDatabase::insert_txn(). |txn| must be the Transaction of |context|.
ham_status_t
TransactionCursor::overwrite(Context *context, LocalTransaction *txn,
                ham_record_t *record)
{
  ham_assert(context->txn == txn);

  if (is_nil())
    return (HAM_CURSOR_IS_NIL);

  TransactionNode *node = m_coupled_op->get_node();

  /* an overwrite is actually an insert w/ HAM_OVERWRITE of the
   * current key */
  return (((LocalDatabase *)get_db())->insert_txn(context, node->get_key(),
                          record, HAM_OVERWRITE, this));
}

// Couples the cursor to the first "visible" operation of |node|.
//
// The operations are visited from newer to older ones (following
// get_previous_in_node()). If |op| is null then the walk starts with the
// newest operation of the node; otherwise it starts with the operation
// which precedes |op|.
//
// Returns
//   0                      if the cursor was coupled to an insert,
//                          insert-overwrite or insert-duplicate operation
//   HAM_KEY_ERASED_IN_TXN  if the cursor was coupled to an erase operation
//   HAM_TXN_CONFLICT       if the operation belongs to a Transaction which
//                          is neither the cursor's own one, nor committed,
//                          nor aborted, and |ignore_conflicts| is false;
//                          the cursor is coupled nevertheless
//   HAM_KEY_NOT_FOUND      if no such operation exists; the coupling of
//                          the cursor is not modified in this case
//
// |flags| is currently not evaluated.
ham_status_t
TransactionCursor::move_top_in_node(TransactionNode *node,
        TransactionOperation *op, bool ignore_conflicts, uint32_t flags)
{
  Transaction *optxn = 0;

  if (!op)
    op = node->get_newest_op();
  else
    goto next; /* |op| itself is skipped; continue with its predecessor */

  while (op) {
    optxn = op->get_txn();
    /* only look at ops from the current transaction and from
     * committed transactions */
    if (optxn == m_parent->get_txn() || optxn->is_committed()) {
      /* a normal (overwriting) insert will return this key */
      if ((op->get_flags() & TransactionOperation::kInsert)
          || (op->get_flags() & TransactionOperation::kInsertOverwrite)) {
        couple_to_op(op);
        return (0);
      }
      /* retrieve a duplicate key */
      if (op->get_flags() & TransactionOperation::kInsertDuplicate) {
        /* the duplicates are handled by the caller. here we only
         * couple to the first op */
        couple_to_op(op);
        return (0);
      }
      /* a normal erase will return an error (but we still couple the
       * cursor because the caller might need to know WHICH key was
       * deleted!) */
      if (op->get_flags() & TransactionOperation::kErase) {
        couple_to_op(op);
        return (HAM_KEY_ERASED_IN_TXN);
      }
      /* everything else is a bug! */
      ham_assert(op->get_flags() == TransactionOperation::kNop);
    }
    else if (optxn->is_aborted())
      ; /* nop */
    else if (!ignore_conflicts) {
      /* we still have to couple, because higher-level functions
       * will need to know about the op when consolidating the trees */
      couple_to_op(op);
      return (HAM_TXN_CONFLICT);
    }

next:
    /* the dupecache index of the parent cursor is reset whenever the
     * walk advances to an older operation */
    m_parent->set_dupecache_index(0);
    op = op->get_previous_in_node();
  }

  return (HAM_KEY_NOT_FOUND);
}

// Moves the cursor to the first, last, next or previous node of the
// transaction index, depending on |flags| (HAM_CURSOR_FIRST,
// HAM_CURSOR_LAST, HAM_CURSOR_NEXT, HAM_CURSOR_PREVIOUS).
//
// Returns HAM_KEY_NOT_FOUND if there is no such node, HAM_CURSOR_IS_NIL
// if a relative move is requested but the cursor is nil, otherwise the
// status of move_top_in_node().
ham_status_t
TransactionCursor::move(uint32_t flags)
{
  ham_status_t st;
  TransactionNode *node;

  if (flags & HAM_CURSOR_FIRST) {
    /* first set cursor to nil */
    set_to_nil();

    node = get_db()->txn_index()->get_first();
    if (!node)
      return (HAM_KEY_NOT_FOUND);
    return (move_top_in_node(node, 0, false, flags));
  }
  else if (flags & HAM_CURSOR_LAST) {
    /* first set cursor to nil */
    set_to_nil();

    node = get_db()->txn_index()->get_last();
    if (!node)
      return (HAM_KEY_NOT_FOUND);
    return (move_top_in_node(node, 0, false, flags));
  }
  else if (flags & HAM_CURSOR_NEXT) {
    if (is_nil())
      return (HAM_CURSOR_IS_NIL);

    node = m_coupled_op->get_node();

    ham_assert(!is_nil());

    /* move to the next node (the next key); if that node has no
     * operation to which we can couple then continue with the node after
     * it. repeat till we've found a key or till we've reached the end of
     * the tree. conflicts are ignored while iterating. */
    while (1) {
      node = node->get_next_sibling();
      if (!node)
        return (HAM_KEY_NOT_FOUND);
      st = move_top_in_node(node, 0, true, flags);
      if (st == HAM_KEY_NOT_FOUND)
        continue;
      return (st);
    }
  }
  else if (flags & HAM_CURSOR_PREVIOUS) {
    if (is_nil())
      return (HAM_CURSOR_IS_NIL);

    node = m_coupled_op->get_node();

    ham_assert(!is_nil());

    /* move to the previous node (the previous key); if that node has no
     * operation to which we can couple then continue with the node before
     * it. repeat till we've found a key or till we've reached the
     * beginning of the tree. conflicts are ignored while iterating. */
    while (1) {
      node = node->get_previous_sibling();
      if (!node)
        return (HAM_KEY_NOT_FOUND);
      st = move_top_in_node(node, 0, true, flags);
      if (st == HAM_KEY_NOT_FOUND)
        continue;
      return (st);
    }
  }
  else {
    ham_assert(!"this flag is not yet implemented");
  }

  /* only reached for unsupported flags (if the assert above is compiled
   * out) */
  return (0);
}

// Looks up |key| in the transaction index and couples the cursor to the
// newest visible operation of the matching node.
//
// |flags| are forwarded to TransactionIndex::get(). If the key was erased
// and HAM_FIND_GT_MATCH or HAM_FIND_LT_MATCH is set then the lookup
// continues with the next/previous node.
//
// Returns HAM_KEY_NOT_FOUND if no node was found, otherwise the status of
// move_top_in_node().
ham_status_t
TransactionCursor::find(ham_key_t *key, uint32_t flags)
{
  TransactionNode *node = 0;

  /* first set cursor to nil */
  set_to_nil();

  /* then lookup the node */
  if (get_db()->txn_index())
    node = get_db()->txn_index()->get(key, flags);
  if (!node)
    return (HAM_KEY_NOT_FOUND);

  while (1) {
    /* and then move to the newest insert*-op */
    ham_status_t st = move_top_in_node(node, 0, false, 0);
    if (st != HAM_KEY_ERASED_IN_TXN)
      return (st);

    /* if the key was erased and approx. matching is enabled, then move
    * next/prev till we found a valid key. */
    if (flags & HAM_FIND_GT_MATCH)
      node = node->get_next_sibling();
    else if (flags & HAM_FIND_LT_MATCH)
      node = node->get_previous_sibling();
    else
      return (st);

    if (!node)
      return (HAM_KEY_NOT_FOUND);
  }

  ham_assert(!"should never reach this");
  return (0);
}

// Copies the key of the coupled node into |key| (deep copy).
//
// Unless |key| has the flag HAM_KEY_USER_ALLOC set, the data is stored in
// the key arena which the database returns for the cursor's Transaction.
// Throws Exception(HAM_CURSOR_IS_NIL) if the cursor is nil.
void
TransactionCursor::copy_coupled_key(ham_key_t *key)
{
  Transaction *txn = m_parent->get_txn();
  ham_key_t *source = 0;

  ByteArray *arena = &get_db()->key_arena(txn);

  /* coupled cursor? get key from the txn_op structure */
  if (!is_nil()) {
    TransactionNode *node = m_coupled_op->get_node();

    ham_assert(get_db() == node->get_db());
    source = node->get_key();

    key->size = source->size;
    if (source->data && source->size) {
      if (!(key->flags & HAM_KEY_USER_ALLOC)) {
        arena->resize(source->size);
        key->data = arena->get_ptr();
      }
      memcpy(key->data, source->data, source->size);
    }
    else
      key->data = 0;
    return;
  }

  /* otherwise cursor is nil and we cannot return a key */
  throw Exception(HAM_CURSOR_IS_NIL);
}

// Copies the record of the coupled operation into |record| (deep copy).
//
// Unless |record| has the flag HAM_RECORD_USER_ALLOC set, the data is
// stored in the record arena which the database returns for the cursor's
// Transaction.
// Throws Exception(HAM_CURSOR_IS_NIL) if the cursor is nil.
void
TransactionCursor::copy_coupled_record(ham_record_t *record)
{
  ham_record_t *source = 0;
  Transaction *txn = m_parent->get_txn();

  ByteArray *arena = &get_db()->record_arena(txn);

  /* coupled cursor? get record from the txn_op structure */
  if (!is_nil()) {
    source = m_coupled_op->get_record();

    record->size = source->size;
    if (source->data && source->size) {
      if (!(record->flags & HAM_RECORD_USER_ALLOC)) {
        arena->resize(source->size);
        record->data = arena->get_ptr();
      }
      memcpy(record->data, source->data, source->size);
    }
    else
      record->data = 0;
    return;
  }

  /* otherwise cursor is nil and we cannot return a record */
  throw Exception(HAM_CURSOR_IS_NIL);
}

// Returns the size of the record of the coupled operation.
// Throws Exception(HAM_CURSOR_IS_NIL) if the cursor is nil.
uint64_t
TransactionCursor::get_record_size()
{
  /* coupled cursor? get record from the txn_op structure */
  if (!is_nil())
    return (m_coupled_op->get_record()->size);

  /* otherwise cursor is nil and we cannot return a record size */
  throw Exception(HAM_CURSOR_IS_NIL);
}

// Returns the database of the parent cursor
LocalDatabase *
TransactionCursor::get_db()
{
  return (m_parent->get_db());
}

// Inserts |key| and |record| through LocalDatabase::insert_txn(), using
// the Transaction of the parent cursor.
// (The class declaration states that this is only used in the unittests.)
ham_status_t
TransactionCursor::test_insert(ham_key_t *key, ham_record_t *record,
                uint32_t flags)
{
  LocalTransaction *txn = dynamic_cast<LocalTransaction *>(m_parent->get_txn());
  Context context(get_db()->lenv(), txn, get_db());

  return (get_db()->insert_txn(&context, key, record, flags, this));
}

// Unlinks this cursor from the double linked cursor list of |op|.
// The cursor must not be nil. |m_coupled_op| is NOT modified; this is up
// to the caller.
void
TransactionCursor::remove_cursor_from_op(TransactionOperation *op)
{
  ham_assert(!is_nil());

  if (op->cursor_list() == this) {
    /* this cursor is the list head: the successor becomes the new head */
    op->set_cursor_list(m_coupled_next);
    if (m_coupled_next)
      m_coupled_next->m_coupled_previous = 0;
  }
  else {
    /* otherwise connect predecessor and successor with each other */
    if (m_coupled_next)
      m_coupled_next->m_coupled_previous = m_coupled_previous;
    if (m_coupled_previous)
      m_coupled_previous->m_coupled_next = m_coupled_next;
  }
  m_coupled_next = 0;
  m_coupled_previous = 0;
}

} // namespace hamsterdb
diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_cursor.h b/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_cursor.h
new file mode 100644
index 0000000000..d2f4462f76
--- /dev/null
+++ b/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_cursor.h
@@ -0,0 +1,170 @@
/*
 * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de).
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ */ + +/* + * A cursor which can iterate over transaction nodes and operations + * + * A Transaction Cursor can walk over Transaction trees (TransactionIndex). + * + * Transaction Cursors are only used as part of the Cursor structure as defined + * in cursor.h. Like all Transaction operations it is in-memory only, + * traversing the red-black tree that is implemented in txn.h, and + * consolidating multiple operations in a node (i.e. if a Transaction first + * overwrites a record, and another transaction then erases the key). + * + * The Transaction Cursor has two states: either it is coupled to a + * Transaction operation (TransactionOperation) or it is unused. + * + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_TXN_CURSOR_H +#define HAM_TXN_CURSOR_H + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "4txn/txn_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +class Cursor; +struct Context; + +// +// An cursor which can iterate over Transaction nodes +// +class TransactionCursor +{ + public: + // Constructor + TransactionCursor(Cursor *parent) + : m_parent(parent) { + m_coupled_op = 0; + m_coupled_next = 0; + m_coupled_previous = 0; + } + + // Destructor; asserts that the cursor is nil + ~TransactionCursor() { + ham_assert(is_nil()); + } + + // Clones another TransactionCursor + void clone(const TransactionCursor *other); + + // Returns the parent cursor + // TODO this should be private + Cursor *get_parent() { + return (m_parent); + } + + // Couples this cursor to a TransactionOperation structure + void couple_to_op(TransactionOperation *op); + + // Returns the pointer to the coupled TransactionOperation + TransactionOperation *get_coupled_op() const { + return (m_coupled_op); + } + + // Sets the cursor to nil + void set_to_nil(); + + // Returns true if the cursor is nil (does not point to any item) + bool is_nil() const { + 
return (m_coupled_op == 0); + } + + // Retrieves the key from the current item; creates a deep copy. + // + // If the cursor is uncoupled, HAM_CURSOR_IS_NIL is returned. this + // means that the item was already flushed to the btree, and the caller has + // to use the btree lookup function to retrieve the key. + void copy_coupled_key(ham_key_t *key); + + // Retrieves the record from the current item; creates a deep copy. + // + // If the cursor is uncoupled, HAM_CURSOR_IS_NIL will be returned. this + // means that the item was already flushed to the btree, and the caller has + // to use the btree lookup function to retrieve the record. + void copy_coupled_record(ham_record_t *record); + + // Moves the cursor to first, last, previous or next + ham_status_t move(uint32_t flags); + + // Overwrites the record of a cursor + ham_status_t overwrite(Context *context, LocalTransaction *txn, + ham_record_t *record); + + // Looks up an item, places the cursor + ham_status_t find(ham_key_t *key, uint32_t flags); + + // Retrieves the record size of the current item + uint64_t get_record_size(); + + // Returns the pointer to the next cursor in the linked list of coupled + // cursors + TransactionCursor *get_coupled_next() { + return (m_coupled_next); + } + + // Closes the cursor + void close() { + set_to_nil(); + } + + private: + friend struct TxnCursorFixture; + + // Removes this cursor from this TransactionOperation + void remove_cursor_from_op(TransactionOperation *op); + + // Inserts an item, places the cursor on the new item. + // This function is only used in the unittests. 
+ ham_status_t test_insert(ham_key_t *key, ham_record_t *record, + uint32_t flags); + + // Returns the database pointer + LocalDatabase *get_db(); + + // Moves the cursor to the first valid Operation in a Node + ham_status_t move_top_in_node(TransactionNode *node, + TransactionOperation *op, bool ignore_conflicts, + uint32_t flags); + + // The parent cursor + Cursor *m_parent; + + // A Cursor can either be coupled or nil ("not in list"). If it's + // coupled, it directly points to a TransactionOperation structure. + // If it's nil then |m_coupled_op| is null. + // + // the txn operation to which we're pointing + TransactionOperation *m_coupled_op; + + // a double linked list with other cursors that are coupled + // to the same Operation + TransactionCursor *m_coupled_next, *m_coupled_previous; +}; + +} // namespace hamsterdb + +#endif /* HAM_TXN_CURSOR_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_factory.h b/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_factory.h new file mode 100644 index 0000000000..2738f1b4d7 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_factory.h @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * A factory to create TransactionOperation and TransactionNode instances. 
+ * + * @exception_safe: strong + * @thread_safe: yes + */ + +#ifndef HAM_TXN_FACTORY_H +#define HAM_TXN_FACTORY_H + +#include "0root/root.h" + +#include "ham/types.h" + +// Always verify that a file of level N does not include headers > N! +#include "1mem/mem.h" +#include "4txn/txn.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +struct TransactionFactory +{ + // Creates a new TransactionOperation + static TransactionOperation *create_operation(LocalTransaction *txn, + TransactionNode *node, uint32_t flags, uint32_t orig_flags, + uint64_t lsn, ham_key_t *key, ham_record_t *record) { + TransactionOperation *op; + op = Memory::allocate<TransactionOperation>(sizeof(*op) + + (record ? record->size : 0) + + (key ? key->size : 0)); + op->initialize(txn, node, flags, orig_flags, lsn, key, record); + return (op); + } + + // Destroys a TransactionOperation + static void destroy_operation(TransactionOperation *op) { + op->destroy(); + } +}; + +} // namespace hamsterdb + +#endif /* HAM_TXN_FACTORY_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_local.cc b/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_local.cc new file mode 100644 index 0000000000..8014b6330f --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_local.cc @@ -0,0 +1,676 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "3btree/btree_index.h" +#include "3journal/journal.h" +#include "4txn/txn_local.h" +#include "4txn/txn_factory.h" +#include "4txn/txn_cursor.h" +#include "4env/env_local.h" +#include "4cursor/cursor.h" +#include "4context/context.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +/* stuff for rb.h */ +#ifndef __ssize_t_defined +typedef signed ssize_t; +#endif +#ifndef __cplusplus +typedef int bool; +#define true 1 +#define false (!true) +#endif /* __cpluscplus */ + +static int +compare(void *vlhs, void *vrhs) +{ + TransactionNode *lhs = (TransactionNode *)vlhs; + TransactionNode *rhs = (TransactionNode *)vrhs; + LocalDatabase *db = lhs->get_db(); + + if (lhs == rhs) + return (0); + + ham_key_t *lhskey = lhs->get_key(); + ham_key_t *rhskey = rhs->get_key(); + ham_assert(lhskey && rhskey); + return (db->btree_index()->compare_keys(lhskey, rhskey)); +} + +rb_proto(static, rbt_, TransactionIndex, TransactionNode) +rb_gen(static, rbt_, TransactionIndex, TransactionNode, node, compare) + +void +TransactionOperation::initialize(LocalTransaction *txn, TransactionNode *node, + uint32_t flags, uint32_t orig_flags, uint64_t lsn, + ham_key_t *key, ham_record_t *record) +{ + memset(this, 0, sizeof(*this)); + + m_txn = txn; + m_node = node; + m_flags = flags; + m_lsn = lsn; + m_orig_flags = orig_flags; + + /* copy the key data */ + if (key) { + m_key = *key; + if (key->size) { + m_key.data = &m_data[0]; + memcpy(m_key.data, key->data, key->size); + } + } + + /* copy the record data */ + if (record) { + m_record = *record; + if (record->size) { + m_record.data = &m_data[key ? 
key->size : 0]; + memcpy(m_record.data, record->data, record->size); + } + } +} + +void +TransactionOperation::destroy() +{ + bool delete_node = false; + + /* remove this op from the node */ + TransactionNode *node = get_node(); + if (node->get_oldest_op() == this) { + /* if the node is empty: remove the node from the tree */ + // TODO should this be done in here?? + if (get_next_in_node() == 0) { + node->get_db()->txn_index()->remove(node); + delete_node = true; + } + node->set_oldest_op(get_next_in_node()); + } + + /* remove this operation from the two linked lists */ + TransactionOperation *next = get_next_in_node(); + TransactionOperation *prev = get_previous_in_node(); + if (next) + next->set_previous_in_node(prev); + if (prev) + prev->set_next_in_node(next); + + next = get_next_in_txn(); + prev = get_previous_in_txn(); + if (next) + next->set_previous_in_txn(prev); + if (prev) + prev->set_next_in_txn(next); + + if (delete_node) + delete node; + + Memory::release(this); +} + +TransactionNode * +TransactionNode::get_next_sibling() +{ + return (rbt_next(get_db()->txn_index(), this)); +} + +TransactionNode * +TransactionNode::get_previous_sibling() +{ + return (rbt_prev(get_db()->txn_index(), this)); +} + +TransactionNode::TransactionNode(LocalDatabase *db, ham_key_t *key) + : m_db(db), m_oldest_op(0), m_newest_op(0), m_key(key) +{ + /* make sure that a node with this key does not yet exist */ + // TODO re-enable this; currently leads to a stack overflow because + // TransactionIndex::get() creates a new TransactionNode + // ham_assert(TransactionIndex::get(key, 0) == 0); +} + +TransactionNode::~TransactionNode() +{ +} + +TransactionOperation * +TransactionNode::append(LocalTransaction *txn, uint32_t orig_flags, + uint32_t flags, uint64_t lsn, ham_key_t *key, + ham_record_t *record) +{ + TransactionOperation *op = TransactionFactory::create_operation(txn, + this, flags, orig_flags, lsn, + key, record); + + /* store it in the chronological list which is managed by 
the node */ + if (!get_newest_op()) { + ham_assert(get_oldest_op() == 0); + set_newest_op(op); + set_oldest_op(op); + } + else { + TransactionOperation *newest = get_newest_op(); + newest->set_next_in_node(op); + op->set_previous_in_node(newest); + set_newest_op(op); + } + + /* store it in the chronological list which is managed by the transaction */ + if (!txn->get_newest_op()) { + ham_assert(txn->get_oldest_op() == 0); + txn->set_newest_op(op); + txn->set_oldest_op(op); + } + else { + TransactionOperation *newest = txn->get_newest_op(); + newest->set_next_in_txn(op); + op->set_previous_in_txn(newest); + txn->set_newest_op(op); + } + + // now that an operation is attached make sure that the node no + // longer uses the temporary key pointer + m_key = 0; + + return (op); +} + +void +TransactionIndex::store(TransactionNode *node) +{ + rbt_insert(this, node); +} + +void +TransactionIndex::remove(TransactionNode *node) +{ +#ifdef HAM_DEBUG + bool found = false; + TransactionNode *n = rbt_first(this); + while (n) { + if (n == node) { + found = true; + break; + } + n = rbt_next(this, n); + } + ham_assert(found == true); +#endif + + rbt_remove(this, node); +} + +LocalTransactionManager::LocalTransactionManager(Environment *env) + : TransactionManager(env), m_txn_id(0), m_queued_txn_for_flush(0), + m_queued_ops_for_flush(0), m_queued_bytes_for_flush(0), + m_txn_threshold(kFlushTxnThreshold), + m_ops_threshold(kFlushOperationsThreshold), + m_bytes_threshold(kFlushBytesThreshold) +{ + if (m_env->get_flags() & HAM_FLUSH_WHEN_COMMITTED) { + m_txn_threshold = 0; + m_ops_threshold = 0; + m_bytes_threshold = 0; + } +} + +LocalTransaction::LocalTransaction(LocalEnvironment *env, const char *name, + uint32_t flags) + : Transaction(env, name, flags), m_log_desc(0), m_oldest_op(0), + m_newest_op(0), m_op_counter(0), m_accum_data_size(0) +{ + LocalTransactionManager *ltm = + (LocalTransactionManager *)env->txn_manager(); + m_id = ltm->get_incremented_txn_id(); + + /* append journal 
entry */ + if (env->get_flags() & HAM_ENABLE_RECOVERY + && env->get_flags() & HAM_ENABLE_TRANSACTIONS + && !(flags & HAM_TXN_TEMPORARY)) { + env->journal()->append_txn_begin(this, name, + env->next_lsn()); + } +} + +LocalTransaction::~LocalTransaction() +{ + free_operations(); +} + +void +LocalTransaction::commit(uint32_t flags) +{ + /* are cursors attached to this txn? if yes, fail */ + if (get_cursor_refcount()) { + ham_trace(("Transaction cannot be committed till all attached " + "Cursors are closed")); + throw Exception(HAM_CURSOR_STILL_OPEN); + } + + /* this transaction is now committed! */ + m_flags |= kStateCommitted; +} + +void +LocalTransaction::abort(uint32_t flags) +{ + /* are cursors attached to this txn? if yes, fail */ + if (get_cursor_refcount()) { + ham_trace(("Transaction cannot be aborted till all attached " + "Cursors are closed")); + throw Exception(HAM_CURSOR_STILL_OPEN); + } + + /* this transaction is now aborted! */ + m_flags |= kStateAborted; + + /* immediately release memory of the cached operations */ + free_operations(); +} + +void +LocalTransaction::free_operations() +{ + TransactionOperation *n, *op = get_oldest_op(); + + while (op) { + n = op->get_next_in_txn(); + TransactionFactory::destroy_operation(op); + op = n; + } + + set_oldest_op(0); + set_newest_op(0); +} + +TransactionIndex::TransactionIndex(LocalDatabase *db) + : m_db(db) +{ + rbt_new(this); +} + +TransactionIndex::~TransactionIndex() +{ + TransactionNode *node; + + while ((node = rbt_last(this))) { + remove(node); + delete node; + } + + // re-initialize the tree + rbt_new(this); +} + +TransactionNode * +TransactionIndex::get(ham_key_t *key, uint32_t flags) +{ + TransactionNode *node = 0; + int match = 0; + + /* create a temporary node that we can search for */ + TransactionNode tmp(m_db, key); + + /* search if node already exists - if yes, return it */ + if ((flags & HAM_FIND_GEQ_MATCH) == HAM_FIND_GEQ_MATCH) { + node = rbt_nsearch(this, &tmp); + if (node) + match = 
compare(&tmp, node); + } + else if ((flags & HAM_FIND_LEQ_MATCH) == HAM_FIND_LEQ_MATCH) { + node = rbt_psearch(this, &tmp); + if (node) + match = compare(&tmp, node); + } + else if (flags & HAM_FIND_GT_MATCH) { + node = rbt_search(this, &tmp); + if (node) + node = node->get_next_sibling(); + else + node = rbt_nsearch(this, &tmp); + match = 1; + } + else if (flags & HAM_FIND_LT_MATCH) { + node = rbt_search(this, &tmp); + if (node) + node = node->get_previous_sibling(); + else + node = rbt_psearch(this, &tmp); + match = -1; + } + else + return (rbt_search(this, &tmp)); + + /* tree is empty? */ + if (!node) + return (0); + + /* approx. matching: set the key flag */ + if (match < 0) + ham_key_set_intflags(key, (ham_key_get_intflags(key) + & ~BtreeKey::kApproximate) | BtreeKey::kLower); + else if (match > 0) + ham_key_set_intflags(key, (ham_key_get_intflags(key) + & ~BtreeKey::kApproximate) | BtreeKey::kGreater); + + return (node); +} + +TransactionNode * +TransactionIndex::get_first() +{ + return (rbt_first(this)); +} + +TransactionNode * +TransactionIndex::get_last() +{ + return (rbt_last(this)); +} + +void +TransactionIndex::enumerate(Context *context, + TransactionIndex::Visitor *visitor) +{ + TransactionNode *node = rbt_first(this); + + while (node) { + visitor->visit(context, node); + node = rbt_next(this, node); + } +} + +struct KeyCounter : public TransactionIndex::Visitor +{ + KeyCounter(LocalDatabase *_db, LocalTransaction *_txn, bool _distinct) + : counter(0), distinct(_distinct), txn(_txn), db(_db) { + } + + void visit(Context *context, TransactionNode *node) { + BtreeIndex *be = db->btree_index(); + TransactionOperation *op; + + /* + * look at each tree_node and walk through each operation + * in reverse chronological order (from newest to oldest): + * - is this op part of an aborted txn? then skip it + * - is this op part of a committed txn? then include it + * - is this op part of an txn which is still active? 
then include it + * - if a committed txn has erased the item then there's no need + * to continue checking older, committed txns of the same key + * + * !! + * if keys are overwritten or a duplicate key is inserted, then + * we have to consolidate the btree keys with the txn-tree keys. + */ + op = node->get_newest_op(); + while (op) { + LocalTransaction *optxn = op->get_txn(); + if (optxn->is_aborted()) + ; // nop + else if (optxn->is_committed() || txn == optxn) { + if (op->get_flags() & TransactionOperation::kIsFlushed) + ; // nop + // if key was erased then it doesn't exist + else if (op->get_flags() & TransactionOperation::kErase) + return; + else if (op->get_flags() & TransactionOperation::kInsert) { + counter++; + return; + } + // key exists - include it + else if ((op->get_flags() & TransactionOperation::kInsert) + || (op->get_flags() & TransactionOperation::kInsertOverwrite)) { + // check if the key already exists in the btree - if yes, + // we do not count it (it will be counted later) + if (HAM_KEY_NOT_FOUND == be->find(context, 0, node->get_key(), 0, 0, 0, 0)) + counter++; + return; + } + else if (op->get_flags() & TransactionOperation::kInsertDuplicate) { + // check if btree has other duplicates + if (0 == be->find(context, 0, node->get_key(), 0, 0, 0, 0)) { + // yes, there's another one + if (distinct) + return; + counter++; + } + else { + // check if other key is in this node + counter++; + if (distinct) + return; + } + } + else if (!(op->get_flags() & TransactionOperation::kNop)) { + ham_assert(!"shouldn't be here"); + return; + } + } + else { // txn is still active + counter++; + } + + op = op->get_previous_in_node(); + } + } + + uint64_t counter; + bool distinct; + LocalTransaction *txn; + LocalDatabase *db; +}; + +uint64_t +TransactionIndex::count(Context *context, LocalTransaction *txn, bool distinct) +{ + KeyCounter k(m_db, txn, distinct); + enumerate(context, &k); + return (k.counter); +} + +void +LocalTransactionManager::begin(Transaction 
*txn) +{ + append_txn_at_tail(txn); +} + +ham_status_t +LocalTransactionManager::commit(Transaction *htxn, uint32_t flags) +{ + LocalTransaction *txn = dynamic_cast<LocalTransaction *>(htxn); + Context context(lenv(), txn, 0); + + try { + txn->commit(flags); + + /* append journal entry */ + if (m_env->get_flags() & HAM_ENABLE_RECOVERY + && m_env->get_flags() & HAM_ENABLE_TRANSACTIONS + && !(txn->get_flags() & HAM_TXN_TEMPORARY)) + lenv()->journal()->append_txn_commit(txn, + lenv()->next_lsn()); + + /* flush committed transactions */ + m_queued_txn_for_flush++; + m_queued_ops_for_flush += txn->get_op_counter(); + m_queued_bytes_for_flush += txn->get_accum_data_size(); + maybe_flush_committed_txns(&context); + } + catch (Exception &ex) { + return (ex.code); + } + return (0); +} + +ham_status_t +LocalTransactionManager::abort(Transaction *htxn, uint32_t flags) +{ + LocalTransaction *txn = dynamic_cast<LocalTransaction *>(htxn); + Context context(lenv(), txn, 0); + + try { + txn->abort(flags); + + /* append journal entry */ + if (m_env->get_flags() & HAM_ENABLE_RECOVERY + && m_env->get_flags() & HAM_ENABLE_TRANSACTIONS + && !(txn->get_flags() & HAM_TXN_TEMPORARY)) + lenv()->journal()->append_txn_abort(txn, + lenv()->next_lsn()); + + /* flush committed transactions; while this one was not committed, + * we might have cleared the way now to flush other committed + * transactions */ + m_queued_txn_for_flush++; + + /* no need to increment m_queued_{ops,bytes}_for_flush because this + * operation does no longer contain any operations */ + maybe_flush_committed_txns(&context); + } + catch (Exception &ex) { + return (ex.code); + } + return (0); +} + +void +LocalTransactionManager::maybe_flush_committed_txns(Context *context) +{ + if (m_queued_txn_for_flush > m_txn_threshold + || m_queued_ops_for_flush > m_ops_threshold + || m_queued_bytes_for_flush > m_bytes_threshold) + flush_committed_txns_impl(context); +} + +void +LocalTransactionManager::flush_committed_txns(Context 
*context /* = 0 */) +{ + if (!context) { + Context new_context(lenv(), 0, 0); + flush_committed_txns_impl(&new_context); + } + else + flush_committed_txns_impl(context); +} + +void +LocalTransactionManager::flush_committed_txns_impl(Context *context) +{ + LocalTransaction *oldest; + Journal *journal = lenv()->journal(); + uint64_t highest_lsn = 0; + + ham_assert(context->changeset.is_empty()); + + /* always get the oldest transaction; if it was committed: flush + * it; if it was aborted: discard it; otherwise return */ + while ((oldest = (LocalTransaction *)get_oldest_txn())) { + if (oldest->is_committed()) { + m_queued_ops_for_flush -= oldest->get_op_counter(); + ham_assert(m_queued_ops_for_flush >= 0); + m_queued_bytes_for_flush -= oldest->get_accum_data_size(); + ham_assert(m_queued_bytes_for_flush >= 0); + uint64_t lsn = flush_txn(context, (LocalTransaction *)oldest); + if (lsn > highest_lsn) + highest_lsn = lsn; + + /* this transaction was flushed! */ + if (journal && (oldest->get_flags() & HAM_TXN_TEMPORARY) == 0) + journal->transaction_flushed(oldest); + } + else if (oldest->is_aborted()) { + ; /* nop */ + } + else + break; + + /* it's possible that Transactions were aborted directly, and not through + * the TransactionManager (i.e. in Journal::abort_uncommitted_txns). 
+ * so don't rely on m_queued_txn_for_flush, it might be zero */ + if (m_queued_txn_for_flush > 0) + m_queued_txn_for_flush--; + + /* now remove the txn from the linked list */ + remove_txn_from_head(oldest); + + /* and release the memory */ + delete oldest; + } + + /* now flush the changeset and write the modified pages to disk */ + if (highest_lsn && m_env->get_flags() & HAM_ENABLE_RECOVERY) + context->changeset.flush(highest_lsn); + else + context->changeset.clear(); + + ham_assert(context->changeset.is_empty()); +} + +uint64_t +LocalTransactionManager::flush_txn(Context *context, LocalTransaction *txn) +{ + TransactionOperation *op = txn->get_oldest_op(); + TransactionCursor *cursor = 0; + uint64_t highest_lsn = 0; + + while (op) { + TransactionNode *node = op->get_node(); + + if (op->get_flags() & TransactionOperation::kIsFlushed) + goto next_op; + + // perform the actual operation in the btree + node->get_db()->flush_txn_operation(context, txn, op); + + /* + * this op is about to be flushed! + * + * as a consequence, all (txn)cursors which are coupled to this op + * have to be uncoupled, as their parent (btree) cursor was + * already coupled to the btree item instead + */ + op->set_flushed(); +next_op: + while ((cursor = op->cursor_list())) { + Cursor *pc = cursor->get_parent(); + ham_assert(pc->get_txn_cursor() == cursor); + pc->couple_to_btree(); // TODO merge both calls? 
+ if (!pc->is_nil(Cursor::kTxn)) + pc->set_to_nil(Cursor::kTxn); + } + + ham_assert(op->get_lsn() > highest_lsn); + highest_lsn = op->get_lsn(); + + /* continue with the next operation of this txn */ + op = op->get_next_in_txn(); + } + + return (highest_lsn); +} + +} // namespace hamsterdb diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_local.h b/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_local.h new file mode 100644 index 0000000000..cfb563466a --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_local.h @@ -0,0 +1,566 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_TXN_LOCAL_H +#define HAM_TXN_LOCAL_H + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "1rb/rb.h" +#include "4txn/txn.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +struct Context; +class TransactionNode; +class TransactionIndex; +class TransactionCursor; +class LocalTransaction; +class LocalDatabase; +class LocalEnvironment; + + +// +// The TransactionOperation class describes a single operation (i.e. +// insert or erase) in a Transaction. 
+// +class TransactionOperation +{ + public: + enum { + // a NOP operation (empty) + kNop = 0x000000u, + + // txn operation is an insert + kInsert = 0x010000u, + + // txn operation is an insert w/ overwrite + kInsertOverwrite = 0x020000u, + + // txn operation is an insert w/ duplicate + kInsertDuplicate = 0x040000u, + + // txn operation erases the key + kErase = 0x080000u, + + // txn operation was already flushed + kIsFlushed = 0x100000u + }; + + // Returns the flags + uint32_t get_flags() const { + return (m_flags); + } + + // Marks this Operation as flushed (sets kIsFlushed) + void set_flushed() { + m_flags |= kIsFlushed; + } + + // Returns the original flags of ham_insert/ham_cursor_insert/ham_erase... + uint32_t get_orig_flags() const { + return (m_orig_flags); + } + + // Returns the referenced duplicate id + uint32_t get_referenced_dupe() const { + return (m_referenced_dupe); + } + + // Sets the referenced duplicate id + void set_referenced_dupe(uint32_t id) { + m_referenced_dupe = id; + } + + // Returns a pointer to the Transaction of this update + LocalTransaction *get_txn() { + return (m_txn); + } + + // Returns a pointer to the parent node of this update + TransactionNode *get_node() { + return (m_node); + } + + // Returns the lsn of this operation + uint64_t get_lsn() const { + return (m_lsn); + } + + // Returns the key of this operation + ham_key_t *get_key() { + return (&m_key); + } + + // Returns the record of this operation + ham_record_t *get_record() { + return (&m_record); + } + + // Returns the list of Cursors coupled to this operation + TransactionCursor *cursor_list() { + return (m_cursor_list); + } + + // Sets the list of Cursors coupled to this operation + void set_cursor_list(TransactionCursor *cursors) { + m_cursor_list = cursors; + } + + // Returns the next TransactionOperation which modifies the + // same TransactionNode + TransactionOperation *get_next_in_node() { + return (m_node_next); + } + + // Returns the previous TransactionOperation which 
modifies the + // same TransactionNode + TransactionOperation *get_previous_in_node() { + return (m_node_prev); + } + + // Returns the next TransactionOperation in the same Transaction + TransactionOperation *get_next_in_txn() { + return (m_txn_next); + } + + // Returns the previous TransactionOperation in the same Transaction + TransactionOperation *get_previous_in_txn() { + return (m_txn_prev); + } + + private: + friend class TransactionNode; + friend struct TransactionFactory; + + // Initialization + void initialize(LocalTransaction *txn, TransactionNode *node, + uint32_t flags, uint32_t orig_flags, uint64_t lsn, + ham_key_t *key, ham_record_t *record); + + // Destructor + void destroy(); + + // Sets the next TransactionOperation which modifies the + // same TransactionNode + void set_next_in_node(TransactionOperation *next) { + m_node_next = next; + } + + // Sets the previous TransactionOperation which modifies the + // same TransactionNode + void set_previous_in_node(TransactionOperation *prev) { + m_node_prev = prev; + } + + // Sets the next TransactionOperation in the same Transaction + void set_next_in_txn(TransactionOperation *next) { + m_txn_next = next; + } + + // Sets the previous TransactionOperation in the same Transaction + void set_previous_in_txn(TransactionOperation *prev) { + m_txn_prev = prev; + } + + // the Transaction of this operation + LocalTransaction *m_txn; + + // the parent node + TransactionNode *m_node; + + // flags and type of this operation; defined in this file + uint32_t m_flags; + + // the original flags of this operation, used when calling + // ham_cursor_insert, ham_insert, ham_erase etc + uint32_t m_orig_flags; + + // the referenced duplicate id (if necessary) - used if this is + // i.e. 
a ham_cursor_erase, ham_cursor_overwrite or ham_cursor_insert + // with a DUPLICATE_AFTER/BEFORE flag + // this is 1-based (like dupecache-index, which is also 1-based) + uint32_t m_referenced_dupe; + + // the log serial number (lsn) of this operation + uint64_t m_lsn; + + // a linked list of cursors which are attached to this operation + TransactionCursor *m_cursor_list; + + // next in linked list (managed in TransactionNode) + TransactionOperation *m_node_next; + + // previous in linked list (managed in TransactionNode) + TransactionOperation *m_node_prev; + + // next in linked list (managed in Transaction) + TransactionOperation *m_txn_next; + + // previous in linked list (managed in Transaction) + TransactionOperation *m_txn_prev; + + // the key which is inserted or overwritten + ham_key_t m_key; + + // the record which is inserted or overwritten + ham_record_t m_record; + + // Storage for record->data. This saves us one memory allocation. + uint8_t m_data[1]; +}; + + +// +// A node in the Transaction Index, used as the node structure in rb.h. +// Manages a group of TransactionOperation objects which all modify the +// same key. +// +// To avoid chicken-egg problems when inserting a new TransactionNode +// into the TransactionTree, it is possible to assign a temporary key +// to this node. However, as soon as an operation is attached to this node, +// the TransactionNode class will use the key structure in this operation. +// +// This basically avoids one memory allocation. +// +class TransactionNode +{ + public: + // Constructor; + // The default parameters are required for the compilation of rb.h. + // |key| is just a temporary pointer which allows to create a + // TransactionNode without further memory allocations/copying. The actual + // key is then fetched from |m_oldest_op| as soon as this node is fully + // initialized. 
+ TransactionNode(LocalDatabase *db = 0, ham_key_t *key = 0); + + // Destructor; removes this node from the tree, unless |dont_insert| + // was set to true + ~TransactionNode(); + + // Returns the database + LocalDatabase *get_db() { + return (m_db); + } + + // Returns the key of the oldest operation, or the temporary key if none is attached + ham_key_t *get_key() { + return (m_oldest_op ? m_oldest_op->get_key() : m_key); + } + + // Retrieves the next larger sibling of a given node, or NULL if there + // is no sibling + TransactionNode *get_next_sibling(); + + // Retrieves the previous (presumably next smaller - TODO confirm) sibling of a + // given node, or NULL if there is no sibling + TransactionNode *get_previous_sibling(); + + // Returns the first (oldest) TransactionOperation in this node + TransactionOperation *get_oldest_op() { + return (m_oldest_op); + }; + + // Sets the first (oldest) TransactionOperation in this node + void set_oldest_op(TransactionOperation *oldest) { + m_oldest_op = oldest; + } + + // Returns the last (newest) TransactionOperation in this node + TransactionOperation *get_newest_op() { + return (m_newest_op); + }; + + // Sets the last (newest) TransactionOperation in this node + void set_newest_op(TransactionOperation *newest) { + m_newest_op = newest; + } + + // Appends an actual operation to this node + TransactionOperation *append(LocalTransaction *txn, uint32_t orig_flags, + uint32_t flags, uint64_t lsn, ham_key_t *key, + ham_record_t *record); + + // red-black tree stub, required for rb.h + rb_node(TransactionNode) node; + + private: + friend struct TxnFixture; + + // the database - need this to get the compare function + LocalDatabase *m_db; + + // the linked list of operations - head is oldest operation + TransactionOperation *m_oldest_op; + + // the linked list of operations - tail is newest operation + TransactionOperation *m_newest_op; + + // Pointer to the key data; only used as long as there are no operations + // attached. Otherwise we have a chicken-egg problem in rb.h. 
+ ham_key_t *m_key; +}; + + +// +// Each Database has a binary tree which stores the current Transaction +// operations; this tree is implemented in TransactionIndex +// +class TransactionIndex +{ + public: + // Traverses a TransactionIndex; for each node, a callback is executed + struct Visitor { + virtual void visit(Context *context, TransactionNode *node) = 0; + }; + + // Constructor + TransactionIndex(LocalDatabase *db); + + // Destructor; frees all nodes and their operations + ~TransactionIndex(); + + // Stores a new TransactionNode in the index + void store(TransactionNode *node); + + // Removes a TransactionNode from the index + void remove(TransactionNode *node); + + // Visits every node in the TransactionTree + void enumerate(Context *context, Visitor *visitor); + + // Returns an opnode for an optree; if a node with this + // key already exists then the existing node is returned, otherwise NULL. + // |flags| can be HAM_FIND_GEQ_MATCH, HAM_FIND_LEQ_MATCH etc + TransactionNode *get(ham_key_t *key, uint32_t flags); + + // Returns the first (= "smallest") node of the tree, or NULL if the + // tree is empty + TransactionNode *get_first(); + + // Returns the last (= "greatest") node of the tree, or NULL if the + // tree is empty + TransactionNode *get_last(); + + // Returns the key count of this index + uint64_t count(Context *context, LocalTransaction *txn, bool distinct); + + // private: //TODO re-enable this; currently disabled because rb.h needs it + // the Database for all operations in this tree + LocalDatabase *m_db; + + // stuff for rb.h + TransactionNode *rbt_root; + TransactionNode rbt_nil; +}; + + +// +// A local Transaction +// +class LocalTransaction : public Transaction +{ + public: + // Constructor; "begins" the Transaction + // supported flags: HAM_TXN_READ_ONLY, HAM_TXN_TEMPORARY + LocalTransaction(LocalEnvironment *env, const char *name, uint32_t flags); + + // Destructor; frees all TransactionOperation structures associated + // with this 
Transaction + virtual ~LocalTransaction(); + + // Commits the Transaction + void commit(uint32_t flags = 0); + + // Aborts the Transaction + void abort(uint32_t flags = 0); + + // Returns the first (or 'oldest') TransactionOperation of this Transaction + TransactionOperation *get_oldest_op() const { + return (m_oldest_op); + } + + // Sets the first (or 'oldest') TransactionOperation of this Transaction + void set_oldest_op(TransactionOperation *op) { + m_oldest_op = op; + } + + // Returns the last (or 'newest') TransactionOperation of this Transaction + TransactionOperation *get_newest_op() const { + return (m_newest_op); + } + + // Sets the last (or 'newest') TransactionOperation of this Transaction + void set_newest_op(TransactionOperation *op) { + if (op) { + m_op_counter++; + m_accum_data_size += op->get_record() + ? op->get_record()->size + : 0; + m_accum_data_size += op->get_node()->get_key()->size; + } + m_newest_op = op; + } + + // Returns the number of operations attached to this Transaction + int get_op_counter() const { + return (m_op_counter); + } + + // Returns the accumulated data size of all operations + int get_accum_data_size() const { + return (m_accum_data_size); + } + + private: + friend class Journal; + friend struct TxnFixture; + friend struct TxnCursorFixture; + + // Frees the internal structures; releases all the memory. This is + // called in the destructor, but also when aborting a Transaction + // (before it's deleted by the Environment). 
+ void free_operations(); + + // Returns the index of the journal's log file descriptor + int get_log_desc() const { + return (m_log_desc); + } + + // Sets the index of the journal's log file descriptor + void set_log_desc(int desc) { + m_log_desc = desc; + } + + // index of the log file descriptor for this transaction [0..1] + int m_log_desc; + + // the linked list of operations - head is oldest operation + TransactionOperation *m_oldest_op; + + // the linked list of operations - tail is newest operation + TransactionOperation *m_newest_op; + + // For counting the operations + int m_op_counter; + + // The approximate accumulated memory consumed by this Transaction + // (sums up key->size and record->size over all operations) + int m_accum_data_size; +}; + + +// +// A TransactionManager for local Transactions +// +class LocalTransactionManager : public TransactionManager +{ + enum { + // flush if this limit is exceeded + kFlushTxnThreshold = 64, + + // flush if this limit is exceeded + kFlushOperationsThreshold = kFlushTxnThreshold * 20, + + // flush if this limit is exceeded + kFlushBytesThreshold = 1024 * 1024 // 1 mb - same as journal buffer + }; + + public: + // Constructor + LocalTransactionManager(Environment *env); + + // Begins a new Transaction + virtual void begin(Transaction *txn); + + // Commits a Transaction; the derived subclass has to take care of + // flushing and/or releasing memory + virtual ham_status_t commit(Transaction *txn, uint32_t flags = 0); + + // Aborts a Transaction; the derived subclass has to take care of + // flushing and/or releasing memory + virtual ham_status_t abort(Transaction *txn, uint32_t flags = 0); + + // Flushes committed (queued) transactions + virtual void flush_committed_txns(Context *context = 0); + + // Increments the global transaction ID and returns the new value. + uint64_t get_incremented_txn_id() { + return (++m_txn_id); + } + + // Returns the current transaction ID; only for testing! 
+ uint64_t test_get_txn_id() const { + return (m_txn_id); + } + + // Sets the current transaction ID; used by the Journal to + // reset the original txn id during recovery. + void set_txn_id(uint64_t id) { + m_txn_id = id; + } + + private: + void flush_committed_txns_impl(Context *context); + + // Flushes a single committed Transaction; returns the lsn of the + // last operation in this transaction + uint64_t flush_txn(Context *context, LocalTransaction *txn); + + // Casts m_env to a LocalEnvironment + LocalEnvironment *lenv() { + return ((LocalEnvironment *)m_env); + } + + // Flushes committed transactions if the number of queued transactions, + // operations or bytes exceeds the corresponding threshold + void maybe_flush_committed_txns(Context *context); + + // The current transaction ID + uint64_t m_txn_id; + + // Number of Transactions waiting to be flushed + int m_queued_txn_for_flush; + + // Combined number of Operations in these transactions waiting to be flushed + int m_queued_ops_for_flush; + + // Approx. memory consumption of all these operations in the flush queue + int m_queued_bytes_for_flush; + + // Flush threshold for the number of queued transactions + int m_txn_threshold; + + // Flush threshold for the number of queued operations + int m_ops_threshold; + + // Flush threshold for the number of queued bytes + int m_bytes_threshold; +}; + +} // namespace hamsterdb + +#endif /* HAM_TXN_LOCAL_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_remote.cc b/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_remote.cc new file mode 100644 index 0000000000..2d4403b077 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_remote.cc @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef HAM_ENABLE_REMOTE + +#include "0root/root.h" + +#include <string.h> + +// Always verify that a file of level N does not include headers > N! +#include "2protobuf/protocol.h" +#include "4txn/txn_remote.h" +#include "4env/env_remote.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +RemoteTransaction::RemoteTransaction(Environment *env, const char *name, + uint32_t flags, uint64_t remote_handle) + : Transaction(env, name, flags), m_remote_handle(remote_handle) +{ +} + +void +RemoteTransaction::commit(uint32_t flags) +{ + /* There's nothing else to do for this Transaction, therefore set it + * to 'aborted' (although it was committed) */ + m_flags |= kStateAborted; +} + +void +RemoteTransaction::abort(uint32_t flags) +{ + /* this transaction is now aborted! 
*/ + m_flags |= kStateAborted; +} + +void +RemoteTransactionManager::begin(Transaction *txn) +{ + append_txn_at_tail(txn); +} + +ham_status_t +RemoteTransactionManager::commit(Transaction *txn, uint32_t flags) +{ + try { + txn->commit(flags); + + /* "flush" (remove) committed and aborted transactions */ + flush_committed_txns(); + } + catch (Exception &ex) { + return (ex.code); + } + return (0); +} + +ham_status_t +RemoteTransactionManager::abort(Transaction *txn, uint32_t flags) +{ + try { + txn->abort(flags); + + /* "flush" (remove) committed and aborted transactions */ + flush_committed_txns(); + } + catch (Exception &ex) { + return (ex.code); + } + return (0); +} + +void +RemoteTransactionManager::flush_committed_txns(Context *context /* = 0 */) +{ + Transaction *oldest; + + while ((oldest = get_oldest_txn())) { + if (oldest->is_committed() || oldest->is_aborted()) { + remove_txn_from_head(oldest); + delete oldest; + } + else + return; + } +} + +} // namespace hamsterdb + +#endif // HAM_ENABLE_REMOTE diff --git a/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_remote.h b/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_remote.h new file mode 100644 index 0000000000..4c7d6f46e5 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/4txn/txn_remote.h @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * @exception_safe: unknown + * @thread_safe: unknown + */ + +#ifndef HAM_TXN_REMOTE_H +#define HAM_TXN_REMOTE_H + +#ifdef HAM_ENABLE_REMOTE + +#include "0root/root.h" + +// Always verify that a file of level N does not include headers > N! +#include "4txn/txn.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +namespace hamsterdb { + +struct Context; + +// +// A remote Transaction +// +class RemoteTransaction : public Transaction +{ + public: + // Constructor; "begins" the Transaction + // supported flags: HAM_TXN_READ_ONLY, HAM_TXN_TEMPORARY + RemoteTransaction(Environment *env, const char *name, uint32_t flags, + uint64_t remote_handle); + + // Commits the Transaction + virtual void commit(uint32_t flags = 0); + + // Aborts the Transaction + virtual void abort(uint32_t flags = 0); + + // Returns the remote Transaction handle + uint64_t get_remote_handle() const { + return (m_remote_handle); + } + + private: + // The remote Transaction handle + uint64_t m_remote_handle; +}; + + +// +// A TransactionManager for remote Transactions +// +class RemoteTransactionManager : public TransactionManager +{ + public: + // Constructor + RemoteTransactionManager(Environment *env) + : TransactionManager(env) { + } + + // Begins a new Transaction + virtual void begin(Transaction *txn); + + // Commits a Transaction; the derived subclass has to take care of + // flushing and/or releasing memory + virtual ham_status_t commit(Transaction *txn, uint32_t flags = 0); + + // Aborts a Transaction; the derived subclass has to take care of + // flushing and/or releasing memory + virtual ham_status_t abort(Transaction *txn, uint32_t flags = 0); + + // Flushes committed (queued) transactions + virtual void flush_committed_txns(Context *context = 0); +}; + +} // namespace hamsterdb + +#endif // HAM_ENABLE_REMOTE + +#endif /* HAM_TXN_REMOTE_H */ diff --git a/plugins/Dbx_kv/src/hamsterdb/src/5hamsterdb/hamsterdb.cc 
b/plugins/Dbx_kv/src/hamsterdb/src/5hamsterdb/hamsterdb.cc new file mode 100644 index 0000000000..ed366ed374 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/5hamsterdb/hamsterdb.cc @@ -0,0 +1,1633 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +#include <stdlib.h> +#include <string.h> + +#include "ham/hamsterdb.h" + +// Always verify that a file of level N does not include headers > N! 
+#include "1base/error.h" +#include "1base/dynamic_array.h" +#include "1mem/mem.h" +#include "2config/db_config.h" +#include "2config/env_config.h" +#include "2page/page.h" +#ifdef HAM_ENABLE_REMOTE +# include "2protobuf/protocol.h" +#endif +#include "2device/device.h" +#include "3btree/btree_stats.h" +#include "3blob_manager/blob_manager.h" +#include "3btree/btree_index.h" +#include "3btree/btree_cursor.h" +#include "4cursor/cursor.h" +#include "4db/db.h" +#include "4env/env.h" +#include "4env/env_header.h" +#include "4env/env_local.h" +#include "4env/env_remote.h" +#include "4txn/txn.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +using namespace hamsterdb; + +/* return true if the filename is for a local file */ +static bool +filename_is_local(const char *filename) +{ + return (!filename || strstr(filename, "ham://") != filename); +} + +ham_status_t +ham_txn_begin(ham_txn_t **htxn, ham_env_t *henv, const char *name, + void *, uint32_t flags) +{ + Transaction **ptxn = (Transaction **)htxn; + + if (!ptxn) { + ham_trace(("parameter 'txn' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + *ptxn = 0; + + if (!henv) { + ham_trace(("parameter 'env' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + Environment *env = (Environment *)henv; + + return (env->txn_begin(ptxn, name, flags)); +} + +HAM_EXPORT const char * +ham_txn_get_name(ham_txn_t *htxn) +{ + Transaction *txn = (Transaction *)htxn; + if (!txn) + return (0); + + const std::string &name = txn->get_env()->txn_get_name(txn); + return (name.empty() ? 
0 : name.c_str()); +} + +ham_status_t +ham_txn_commit(ham_txn_t *htxn, uint32_t flags) +{ + Transaction *txn = (Transaction *)htxn; + if (!txn) { + ham_trace(("parameter 'txn' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + Environment *env = txn->get_env(); + + return (env->txn_commit(txn, flags)); +} + +ham_status_t +ham_txn_abort(ham_txn_t *htxn, uint32_t flags) +{ + Transaction *txn = (Transaction *)htxn; + if (!txn) { + ham_trace(("parameter 'txn' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + Environment *env = txn->get_env(); + + return (env->txn_abort(txn, flags)); +} + +const char * HAM_CALLCONV +ham_strerror(ham_status_t result) +{ + switch (result) { + case HAM_SUCCESS: + return ("Success"); + case HAM_INV_KEY_SIZE: + return ("Invalid key size"); + case HAM_INV_RECORD_SIZE: + return ("Invalid record size"); + case HAM_INV_PAGESIZE: + return ("Invalid page size"); + case HAM_OUT_OF_MEMORY: + return ("Out of memory"); + case HAM_INV_PARAMETER: + return ("Invalid parameter"); + case HAM_INV_FILE_HEADER: + return ("Invalid database file header"); + case HAM_INV_FILE_VERSION: + return ("Invalid database file version"); + case HAM_KEY_NOT_FOUND: + return ("Key not found"); + case HAM_DUPLICATE_KEY: + return ("Duplicate key"); + case HAM_INTEGRITY_VIOLATED: + return ("Internal integrity violated"); + case HAM_INTERNAL_ERROR: + return ("Internal error"); + case HAM_WRITE_PROTECTED: + return ("Database opened in read-only mode"); + case HAM_BLOB_NOT_FOUND: + return ("Data blob not found"); + case HAM_IO_ERROR: + return ("System I/O error"); + case HAM_NOT_IMPLEMENTED: + return ("Operation not implemented"); + case HAM_FILE_NOT_FOUND: + return ("File not found"); + case HAM_WOULD_BLOCK: + return ("Operation would block"); + case HAM_NOT_READY: + return ("Object was not initialized correctly"); + case HAM_CURSOR_STILL_OPEN: + return ("Cursor must be closed prior to Transaction abort/commit"); + case HAM_FILTER_NOT_FOUND: + return ("Record 
filter or file filter not found"); + case HAM_TXN_CONFLICT: + return ("Operation conflicts with another Transaction"); + case HAM_TXN_STILL_OPEN: + return ("Database cannot be closed because it is modified in a " + "Transaction"); + case HAM_CURSOR_IS_NIL: + return ("Cursor points to NIL"); + case HAM_DATABASE_NOT_FOUND: + return ("Database not found"); + case HAM_DATABASE_ALREADY_EXISTS: + return ("Database name already exists"); + case HAM_DATABASE_ALREADY_OPEN: + return ("Database already open, or: Database handle " + "already initialized"); + case HAM_ENVIRONMENT_ALREADY_OPEN: + return ("Environment already open, or: Environment handle " + "already initialized"); + case HAM_LIMITS_REACHED: + return ("Database limits reached"); + case HAM_ALREADY_INITIALIZED: + return ("Object was already initialized"); + case HAM_NEED_RECOVERY: + return ("Database needs recovery"); + case HAM_LOG_INV_FILE_HEADER: + return ("Invalid log file header"); + case HAM_NETWORK_ERROR: + return ("Remote I/O error/Network error"); + default: + return ("Unknown error"); + } +} + +/** + * Prepares a @ref ham_key_t structure for returning key data in. + * + * This function checks whether the @ref ham_key_t structure has been + * properly initialized by the user and resets all internal used elements. + * + * @return true when the @a key structure has been initialized correctly + * before. + * + * @return false when the @a key structure has @e not been initialized + * correctly before. + */ +static inline bool +__prepare_key(ham_key_t *key) +{ + if (unlikely(key->size && !key->data)) { + ham_trace(("key->size != 0, but key->data is NULL")); + return (false); + } + if (unlikely(key->flags != 0 && key->flags != HAM_KEY_USER_ALLOC)) { + ham_trace(("invalid flag in key->flags")); + return (false); + } + key->_flags = 0; + return (true); +} + +/** + * Prepares a @ref ham_record_t structure for returning record data in. 
+ * + * This function checks whether the @ref ham_record_t structure has been + * properly initialized by the user and resets all internal used elements. + * + * @return true when the @a record structure has been initialized + * correctly before. + * + * @return false when the @a record structure has @e not been + * initialized correctly before. + */ +static inline bool +__prepare_record(ham_record_t *record) +{ + if (unlikely(record->size && !record->data)) { + ham_trace(("record->size != 0, but record->data is NULL")); + return false; + } + if (unlikely(record->flags & HAM_DIRECT_ACCESS)) + record->flags &= ~HAM_DIRECT_ACCESS; + if (unlikely(record->flags != 0 && record->flags != HAM_RECORD_USER_ALLOC)) { + ham_trace(("invalid flag in record->flags")); + return (false); + } + return (true); +} + +void HAM_CALLCONV +ham_get_version(uint32_t *major, uint32_t *minor, uint32_t *revision) +{ + if (major) + *major = HAM_VERSION_MAJ; + if (minor) + *minor = HAM_VERSION_MIN; + if (revision) + *revision = HAM_VERSION_REV; +} + +ham_status_t HAM_CALLCONV +ham_env_create(ham_env_t **henv, const char *filename, + uint32_t flags, uint32_t mode, const ham_parameter_t *param) +{ + EnvironmentConfiguration config; + config.filename = filename ? filename : ""; + config.file_mode = mode; + + if (!henv) { + ham_trace(("parameter 'env' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + *henv = 0; + + /* creating a file in READ_ONLY mode? doesn't make sense */ + if (flags & HAM_READ_ONLY) { + ham_trace(("cannot create a file in read-only mode")); + return (HAM_INV_PARAMETER); + } + + /* in-memory? 
recovery is not possible */ + if ((flags & HAM_IN_MEMORY) && (flags & HAM_ENABLE_RECOVERY)) { + ham_trace(("combination of HAM_IN_MEMORY and HAM_ENABLE_RECOVERY " + "not allowed")); + return (HAM_INV_PARAMETER); + } + + if (flags & HAM_ENABLE_CRC32) { + ham_trace(("Crc32 is only available in hamsterdb pro")); + return (HAM_NOT_IMPLEMENTED); + } + + /* HAM_ENABLE_TRANSACTIONS implies HAM_ENABLE_RECOVERY, unless explicitly + * disabled */ + if ((flags & HAM_ENABLE_TRANSACTIONS) && !(flags & HAM_DISABLE_RECOVERY)) + flags |= HAM_ENABLE_RECOVERY; + + /* flag HAM_AUTO_RECOVERY implies HAM_ENABLE_RECOVERY */ + if (flags & HAM_AUTO_RECOVERY) + flags |= HAM_ENABLE_RECOVERY; + + /* in-memory with Transactions? disable recovery */ + if (flags & HAM_IN_MEMORY) + flags &= ~HAM_ENABLE_RECOVERY; + + if (param) { + for (; param->name; param++) { + switch (param->name) { + case HAM_PARAM_JOURNAL_COMPRESSION: + ham_trace(("Journal compression is only available in hamsterdb pro")); + return (HAM_NOT_IMPLEMENTED); + case HAM_PARAM_CACHE_SIZE: + if (flags & HAM_IN_MEMORY && param->value != 0) { + ham_trace(("combination of HAM_IN_MEMORY and cache size != 0 " + "not allowed")); + return (HAM_INV_PARAMETER); + } + /* don't allow cache limits with unlimited cache */ + if (flags & HAM_CACHE_UNLIMITED && param->value != 0) { + ham_trace(("combination of HAM_CACHE_UNLIMITED and cache size != 0 " + "not allowed")); + return (HAM_INV_PARAMETER); + } + if (param->value > 0) + config.cache_size_bytes = (size_t)param->value; + break; + case HAM_PARAM_PAGE_SIZE: + if (param->value != 1024 && param->value % 2048 != 0) { + ham_trace(("invalid page size - must be 1024 or a multiple of 2048")); + return (HAM_INV_PAGESIZE); + } + if (param->value > 0) + config.page_size_bytes = (uint32_t)param->value; + break; + case HAM_PARAM_FILE_SIZE_LIMIT: + if (param->value > 0) + config.file_size_limit_bytes = (size_t)param->value; + break; + case HAM_PARAM_JOURNAL_SWITCH_THRESHOLD: + 
config.journal_switch_threshold = (uint32_t)param->value; + break; + case HAM_PARAM_LOG_DIRECTORY: + config.log_filename = (const char *)param->value; + break; + case HAM_PARAM_NETWORK_TIMEOUT_SEC: + config.remote_timeout_sec = (uint32_t)param->value; + break; + case HAM_PARAM_ENCRYPTION_KEY: + ham_trace(("Encryption is only available in hamsterdb pro")); + return (HAM_NOT_IMPLEMENTED); + case HAM_PARAM_POSIX_FADVISE: + config.posix_advice = (int)param->value; + break; + default: + ham_trace(("unknown parameter %d", (int)param->name)); + return (HAM_INV_PARAMETER); + } + } + } + + if (config.filename.empty() && !(flags & HAM_IN_MEMORY)) { + ham_trace(("filename is missing")); + return (HAM_INV_PARAMETER); + } + + config.flags = flags; + + /* + * make sure that max_databases actually fit in a header + * page! + * leave at least 128 bytes for other header data + */ + config.max_databases = config.page_size_bytes + - sizeof(PEnvironmentHeader) - 128; + config.max_databases /= sizeof(PBtreeHeader); + + ham_status_t st = 0; + Environment *env = 0; + + if (filename_is_local(config.filename.c_str())) { + env = new LocalEnvironment(config); + } + else { +#ifndef HAM_ENABLE_REMOTE + return (HAM_NOT_IMPLEMENTED); +#else // HAM_ENABLE_REMOTE + env = new RemoteEnvironment(config); +#endif + } + +#ifdef HAM_ENABLE_REMOTE + atexit(Protocol::shutdown); +#endif + + /* and finish the initialization of the Environment */ + st = env->create(); + + /* flush the environment to make sure that the header page is written + * to disk TODO required?? 
*/ + if (st == 0) + st = env->flush(0); + + if (st) { + env->close(HAM_AUTO_CLEANUP); + delete env; + return (st); + } + + *henv = (ham_env_t *)env; + return (0); +} + +ham_status_t HAM_CALLCONV +ham_env_create_db(ham_env_t *henv, ham_db_t **hdb, uint16_t db_name, + uint32_t flags, const ham_parameter_t *param) +{ + Environment *env = (Environment *)henv; + DatabaseConfiguration config; + + if (!hdb) { + ham_trace(("parameter 'db' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!env) { + ham_trace(("parameter 'env' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + *hdb = 0; + + if (!db_name || (db_name >= 0xf000)) { + ham_trace(("invalid database name")); + return (HAM_INV_PARAMETER); + } + + config.db_name = db_name; + config.flags = flags; + + return (env->create_db((Database **)hdb, config, param)); +} + +ham_status_t HAM_CALLCONV +ham_env_open_db(ham_env_t *henv, ham_db_t **hdb, uint16_t db_name, + uint32_t flags, const ham_parameter_t *param) +{ + Environment *env = (Environment *)henv; + DatabaseConfiguration config; + + if (!hdb) { + ham_trace(("parameter 'db' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!env) { + ham_trace(("parameter 'env' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + *hdb = 0; + + if (!db_name) { + ham_trace(("parameter 'db_name' must not be 0")); + return (HAM_INV_PARAMETER); + } + if (db_name >= 0xf000) { + ham_trace(("database name must be lower than 0xf000")); + return (HAM_INV_PARAMETER); + } + if (env->get_flags() & HAM_IN_MEMORY) { + ham_trace(("cannot open a Database in an In-Memory Environment")); + return (HAM_INV_PARAMETER); + } + + config.flags = flags; + config.db_name = db_name; + + return (env->open_db((Database **)hdb, config, param)); +} + +ham_status_t HAM_CALLCONV +ham_env_open(ham_env_t **henv, const char *filename, uint32_t flags, + const ham_parameter_t *param) +{ + EnvironmentConfiguration config; + config.filename = filename ? 
filename : ""; + + if (!henv) { + ham_trace(("parameter 'env' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + *henv = 0; + + /* cannot open an in-memory-db */ + if (flags & HAM_IN_MEMORY) { + ham_trace(("cannot open an in-memory database")); + return (HAM_INV_PARAMETER); + } + + /* HAM_ENABLE_DUPLICATE_KEYS has to be specified in ham_env_create_db, + * not ham_env_open */ + if (flags & HAM_ENABLE_DUPLICATE_KEYS) { + ham_trace(("invalid flag HAM_ENABLE_DUPLICATE_KEYS (only allowed when " + "creating a database")); + return (HAM_INV_PARAMETER); + } + + if (flags & HAM_ENABLE_CRC32) { + ham_trace(("Crc32 is only available in hamsterdb pro")); + return (HAM_NOT_IMPLEMENTED); + } + + /* HAM_ENABLE_TRANSACTIONS implies HAM_ENABLE_RECOVERY, unless explicitly + * disabled */ + if ((flags & HAM_ENABLE_TRANSACTIONS) && !(flags & HAM_DISABLE_RECOVERY)) + flags |= HAM_ENABLE_RECOVERY; + + /* flag HAM_AUTO_RECOVERY implies HAM_ENABLE_RECOVERY */ + if (flags & HAM_AUTO_RECOVERY) + flags |= HAM_ENABLE_RECOVERY; + + if (config.filename.empty() && !(flags & HAM_IN_MEMORY)) { + ham_trace(("filename is missing")); + return (HAM_INV_PARAMETER); + } + + if (param) { + for (; param->name; param++) { + switch (param->name) { + case HAM_PARAM_JOURNAL_COMPRESSION: + ham_trace(("Journal compression is only available in hamsterdb pro")); + return (HAM_NOT_IMPLEMENTED); + case HAM_PARAM_CACHE_SIZE: + /* don't allow cache limits with unlimited cache */ + if (flags & HAM_CACHE_UNLIMITED && param->value != 0) { + ham_trace(("combination of HAM_CACHE_UNLIMITED and cache size != 0 " + "not allowed")); + return (HAM_INV_PARAMETER); + } + if (param->value > 0) + config.cache_size_bytes = param->value; + break; + case HAM_PARAM_FILE_SIZE_LIMIT: + if (param->value > 0) + config.file_size_limit_bytes = (size_t)param->value; + break; + case HAM_PARAM_JOURNAL_SWITCH_THRESHOLD: + config.journal_switch_threshold = (uint32_t)param->value; + break; + case HAM_PARAM_LOG_DIRECTORY: + 
config.log_filename = (const char *)param->value; + break; + case HAM_PARAM_NETWORK_TIMEOUT_SEC: + config.remote_timeout_sec = (uint32_t)param->value; + break; + case HAM_PARAM_ENCRYPTION_KEY: + ham_trace(("Encryption is only available in hamsterdb pro")); + return (HAM_NOT_IMPLEMENTED); + case HAM_PARAM_POSIX_FADVISE: + config.posix_advice = (int)param->value; + break; + default: + ham_trace(("unknown parameter %d", (int)param->name)); + return (HAM_INV_PARAMETER); + } + } + } + + config.flags = flags; + + ham_status_t st = 0; + Environment *env = 0; + + if (filename_is_local(config.filename.c_str())) { + env = new LocalEnvironment(config); + } + else { +#ifndef HAM_ENABLE_REMOTE + return (HAM_NOT_IMPLEMENTED); +#else // HAM_ENABLE_REMOTE + env = new RemoteEnvironment(config); +#endif + } + +#ifdef HAM_ENABLE_REMOTE + atexit(Protocol::shutdown); +#endif + + /* and finish the initialization of the Environment */ + st = env->open(); + + if (st) { + (void)env->close(HAM_AUTO_CLEANUP); + delete env; + return (st); + } + + *henv = (ham_env_t *)env; + return (0); +} + +ham_status_t HAM_CALLCONV +ham_env_rename_db(ham_env_t *henv, uint16_t oldname, uint16_t newname, + uint32_t flags) +{ + Environment *env = (Environment *)henv; + if (!env) { + ham_trace(("parameter 'env' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + if (!oldname) { + ham_trace(("parameter 'oldname' must not be 0")); + return (HAM_INV_PARAMETER); + } + if (!newname) { + ham_trace(("parameter 'newname' must not be 0")); + return (HAM_INV_PARAMETER); + } + if (newname >= 0xf000) { + ham_trace(("parameter 'newname' must be lower than 0xf000")); + return (HAM_INV_PARAMETER); + } + + /* no need to do anything if oldname==newname */ + if (oldname == newname) + return (0); + + /* rename the database */ + return (env->rename_db(oldname, newname, flags)); +} + +ham_status_t HAM_CALLCONV +ham_env_erase_db(ham_env_t *henv, uint16_t name, uint32_t flags) +{ + Environment *env = (Environment *)henv; + if 
(!env) { + ham_trace(("parameter 'env' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + if (!name) { + ham_trace(("parameter 'name' must not be 0")); + return (HAM_INV_PARAMETER); + } + + /* erase the database */ + return (env->erase_db(name, flags)); +} + +ham_status_t HAM_CALLCONV +ham_env_get_database_names(ham_env_t *henv, uint16_t *names, uint32_t *count) +{ + Environment *env = (Environment *)henv; + if (!env) { + ham_trace(("parameter 'env' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + if (!names) { + ham_trace(("parameter 'names' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!count) { + ham_trace(("parameter 'count' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + /* get all database names */ + return (env->get_database_names(names, count)); +} + +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_env_get_parameters(ham_env_t *henv, ham_parameter_t *param) +{ + Environment *env = (Environment *)henv; + if (!env) { + ham_trace(("parameter 'env' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + if (!param) { + ham_trace(("parameter 'param' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + /* get the parameters */ + return (env->get_parameters(param)); +} + +ham_status_t HAM_CALLCONV +ham_env_flush(ham_env_t *henv, uint32_t flags) +{ + Environment *env = (Environment *)henv; + if (!env) { + ham_trace(("parameter 'env' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + if (flags && flags != HAM_FLUSH_COMMITTED_TRANSACTIONS) { + ham_trace(("parameter 'flags' is unused, set to 0")); + return (HAM_INV_PARAMETER); + } + + /* flush the Environment */ + return (env->flush(flags)); +} + +ham_status_t HAM_CALLCONV +ham_env_close(ham_env_t *henv, uint32_t flags) +{ + ham_status_t st; + Environment *env = (Environment *)henv; + + if (!env) { + ham_trace(("parameter 'env' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + try { + /* close the environment */ + st = env->close(flags); + if (st) + 
return (st); + + delete env; + return (0); + } + catch (Exception &ex) { + return (ex.code); + } +} + +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_db_get_parameters(ham_db_t *hdb, ham_parameter_t *param) +{ + Database *db = (Database *)hdb; + if (!db) { + ham_trace(("parameter 'db' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + if (!param) { + ham_trace(("parameter 'param' must not be NULL")); + return HAM_INV_PARAMETER; + } + + ScopedLock lock(db->get_env()->mutex()); + + /* get the parameters */ + return (db->set_error(db->get_parameters(param))); +} + +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_db_get_error(ham_db_t *hdb) +{ + Database *db = (Database *)hdb; + if (!db) { + ham_trace(("parameter 'db' must not be NULL")); + return (0); + } + + ScopedLock lock; + if (db->get_env()) + lock = ScopedLock(db->get_env()->mutex()); + + return (db->get_error()); +} + +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_db_set_compare_func(ham_db_t *hdb, ham_compare_func_t foo) +{ + Database *db = (Database *)hdb; + if (!db) { + ham_trace(("parameter 'db' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!foo) { + ham_trace(("function pointer must not be NULL")); + return (HAM_INV_PARAMETER); + } + + LocalDatabase *ldb = dynamic_cast<LocalDatabase *>(db); + if (!ldb) { + ham_trace(("operation not possible for remote databases")); + return (HAM_INV_PARAMETER); + } + + ScopedLock lock(ldb->get_env()->mutex()); + + /* set the compare functions */ + return (ldb->set_error(ldb->set_compare_func(foo))); +} + +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_db_find(ham_db_t *hdb, ham_txn_t *htxn, ham_key_t *key, + ham_record_t *record, uint32_t flags) +{ + Database *db = (Database *)hdb; + Transaction *txn = (Transaction *)htxn; + Environment *env; + + if (!db) { + ham_trace(("parameter 'db' must not be NULL")); + return (HAM_INV_PARAMETER); + } + env = db->get_env(); + + ScopedLock lock(env->mutex()); + + if (!key) { + ham_trace(("parameter 'key' must not be NULL")); + 
return (db->set_error(HAM_INV_PARAMETER)); + } + if (!record) { + ham_trace(("parameter 'record' must not be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if (flags & HAM_HINT_PREPEND) { + ham_trace(("flag HAM_HINT_PREPEND is only allowed in " + "ham_cursor_insert")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if (flags & HAM_HINT_APPEND) { + ham_trace(("flag HAM_HINT_APPEND is only allowed in " + "ham_cursor_insert")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if ((flags & HAM_DIRECT_ACCESS) + && !(env->get_flags() & HAM_IN_MEMORY)) { + ham_trace(("flag HAM_DIRECT_ACCESS is only allowed in " + "In-Memory Databases")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if ((flags & HAM_DIRECT_ACCESS) + && (env->get_flags() & HAM_ENABLE_TRANSACTIONS)) { + ham_trace(("flag HAM_DIRECT_ACCESS is not allowed in " + "combination with Transactions")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if ((flags & HAM_PARTIAL) + && (db->get_flags() & HAM_ENABLE_TRANSACTIONS)) { + ham_trace(("flag HAM_PARTIAL is not allowed in combination with " + "transactions")); + return (db->set_error(HAM_INV_PARAMETER)); + } + + /* record number: make sure that we have a valid key structure */ + if ((db->get_flags() & HAM_RECORD_NUMBER32) && !key->data) { + ham_trace(("key->data must not be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if ((db->get_flags() & HAM_RECORD_NUMBER64) && !key->data) { + ham_trace(("key->data must not be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + + if (!__prepare_key(key) || !__prepare_record(record)) + return (db->set_error(HAM_INV_PARAMETER)); + + return (db->set_error(db->find(0, txn, key, record, flags))); +} + +HAM_EXPORT int HAM_CALLCONV +ham_key_get_approximate_match_type(ham_key_t *key) +{ + if (key && (ham_key_get_intflags(key) & BtreeKey::kApproximate)) { + int rv = (ham_key_get_intflags(key) & BtreeKey::kLower) ? 
-1 : +1; + return (rv); + } + + return (0); +} + +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_db_insert(ham_db_t *hdb, ham_txn_t *htxn, ham_key_t *key, + ham_record_t *record, uint32_t flags) +{ + Database *db = (Database *)hdb; + Transaction *txn = (Transaction *)htxn; + Environment *env; + + if (!db) { + ham_trace(("parameter 'db' must not be NULL")); + return HAM_INV_PARAMETER; + } + env = db->get_env(); + + ScopedLock lock; + if (!(flags & HAM_DONT_LOCK)) + lock = ScopedLock(env->mutex()); + + if (!key) { + ham_trace(("parameter 'key' must not be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if (!record) { + ham_trace(("parameter 'record' must not be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if (flags & HAM_HINT_APPEND) { + ham_trace(("flags HAM_HINT_APPEND is only allowed in " + "ham_cursor_insert")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if (flags & HAM_HINT_PREPEND) { + ham_trace(("flags HAM_HINT_PREPEND is only allowed in " + "ham_cursor_insert")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if (db->get_flags() & HAM_READ_ONLY) { + ham_trace(("cannot insert in a read-only database")); + return (db->set_error(HAM_WRITE_PROTECTED)); + } + if ((flags & HAM_OVERWRITE) && (flags & HAM_DUPLICATE)) { + ham_trace(("cannot combine HAM_OVERWRITE and HAM_DUPLICATE")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if ((flags & HAM_PARTIAL) + && (db->get_flags() & HAM_ENABLE_TRANSACTIONS)) { + ham_trace(("flag HAM_PARTIAL is not allowed in combination with " + "transactions")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if ((flags & HAM_PARTIAL) && (record->size <= sizeof(uint64_t))) { + ham_trace(("flag HAM_PARTIAL is not allowed if record->size " + "<= 8")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if ((flags & HAM_PARTIAL) + && (record->partial_size + record->partial_offset > record->size)) { + ham_trace(("partial offset+size is greater than the total " + "record size")); + return 
(db->set_error(HAM_INV_PARAMETER)); + } + if ((flags & HAM_DUPLICATE) + && !(db->get_flags() & HAM_ENABLE_DUPLICATE_KEYS)) { + ham_trace(("database does not support duplicate keys " + "(see HAM_ENABLE_DUPLICATE_KEYS)")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if ((flags & HAM_DUPLICATE_INSERT_AFTER) + || (flags & HAM_DUPLICATE_INSERT_BEFORE) + || (flags & HAM_DUPLICATE_INSERT_LAST) + || (flags & HAM_DUPLICATE_INSERT_FIRST)) { + ham_trace(("function does not support flags HAM_DUPLICATE_INSERT_*; " + "see ham_cursor_insert")); + return (db->set_error(HAM_INV_PARAMETER)); + } + + if (!__prepare_key(key) || !__prepare_record(record)) + return (db->set_error(HAM_INV_PARAMETER)); + + /* allocate temp. storage for a recno key */ + if ((db->get_flags() & HAM_RECORD_NUMBER32) + || (db->get_flags() & HAM_RECORD_NUMBER64)) { + if (flags & HAM_OVERWRITE) { + if (!key->data) { + ham_trace(("key->data must not be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + } + else { + if (key->flags & HAM_KEY_USER_ALLOC) { + if (!key->data) { + ham_trace(("key->data must not be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + } + else { + if (key->data || key->size) { + ham_trace(("key->size must be 0, key->data must be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + } + } + } + + return (db->set_error(db->insert(0, txn, key, record, flags))); +} + +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_db_erase(ham_db_t *hdb, ham_txn_t *htxn, ham_key_t *key, uint32_t flags) +{ + Database *db = (Database *)hdb; + Transaction *txn = (Transaction *)htxn; + Environment *env; + + if (!db) { + ham_trace(("parameter 'db' must not be NULL")); + return (HAM_INV_PARAMETER); + } + env = db->get_env(); + + ScopedLock lock; + if (!(flags & HAM_DONT_LOCK)) + lock = ScopedLock(env->mutex()); + + if (!key) { + ham_trace(("parameter 'key' must not be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if (flags & HAM_HINT_PREPEND) { + ham_trace(("flag 
HAM_HINT_PREPEND is only allowed in " + "ham_cursor_insert")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if (flags & HAM_HINT_APPEND) { + ham_trace(("flag HAM_HINT_APPEND is only allowed in " + "ham_cursor_insert")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if (db->get_flags() & HAM_READ_ONLY) { + ham_trace(("cannot erase from a read-only database")); + return (HAM_WRITE_PROTECTED); + } + + if (!__prepare_key(key)) + return (db->set_error(HAM_INV_PARAMETER)); + + return (db->set_error(db->erase(0, txn, key, flags))); +} + +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_db_check_integrity(ham_db_t *hdb, uint32_t flags) +{ + Database *db = (Database *)hdb; + + if (!db) { + ham_trace(("parameter 'db' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + if (flags && flags != HAM_PRINT_GRAPH) { + ham_trace(("unknown flag 0x%u", flags)); + return (HAM_INV_PARAMETER); + } + + ScopedLock lock(db->get_env()->mutex()); + + return (db->set_error(db->check_integrity(flags))); +} + +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_db_close(ham_db_t *hdb, uint32_t flags) +{ + Database *db = (Database *)hdb; + + if (!db) { + ham_trace(("parameter 'db' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + if ((flags & HAM_TXN_AUTO_ABORT) && (flags & HAM_TXN_AUTO_COMMIT)) { + ham_trace(("invalid combination of flags: HAM_TXN_AUTO_ABORT + " + "HAM_TXN_AUTO_COMMIT")); + return (db->set_error(HAM_INV_PARAMETER)); + } + + Environment *env = db->get_env(); + + /* it's ok to close an uninitialized Database */ + if (!env) { + delete db; + return (0); + } + + return (env->close_db(db, flags)); +} + +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_cursor_create(ham_cursor_t **hcursor, ham_db_t *hdb, ham_txn_t *htxn, + uint32_t flags) +{ + Database *db = (Database *)hdb; + Transaction *txn = (Transaction *)htxn; + Environment *env; + Cursor **cursor = 0; + + if (!db) { + ham_trace(("parameter 'db' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!hcursor) { + 
ham_trace(("parameter 'cursor' must not be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + + cursor = (Cursor **)hcursor; + env = db->get_env(); + + ScopedLock lock; + if (!(flags & HAM_DONT_LOCK)) + lock = ScopedLock(env->mutex()); + + return (db->set_error(db->cursor_create(cursor, txn, flags))); +} + +ham_status_t HAM_CALLCONV +ham_cursor_clone(ham_cursor_t *hsrc, ham_cursor_t **hdest) +{ + Database *db; + + if (!hsrc) { + ham_trace(("parameter 'src' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!hdest) { + ham_trace(("parameter 'dest' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + Cursor *src, **dest; + src = (Cursor *)hsrc; + dest = (Cursor **)hdest; + + db = src->get_db(); + + ScopedLock lock(db->get_env()->mutex()); + + return (db->set_error(db->cursor_clone(dest, src))); +} + +ham_status_t HAM_CALLCONV +ham_cursor_overwrite(ham_cursor_t *hcursor, ham_record_t *record, + uint32_t flags) +{ + Database *db; + + if (!hcursor) { + ham_trace(("parameter 'cursor' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + Cursor *cursor = (Cursor *)hcursor; + + db = cursor->get_db(); + + ScopedLock lock(db->get_env()->mutex()); + + if (flags) { + ham_trace(("function does not support a non-zero flags value; " + "see ham_cursor_insert for an alternative then")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if (!record) { + ham_trace(("parameter 'record' must not be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if (!__prepare_record(record)) + return (db->set_error(HAM_INV_PARAMETER)); + if (db->get_flags() & HAM_READ_ONLY) { + ham_trace(("cannot overwrite in a read-only database")); + return (db->set_error(HAM_WRITE_PROTECTED)); + } + + return (db->set_error(db->cursor_overwrite(cursor, record, flags))); +} + +ham_status_t HAM_CALLCONV +ham_cursor_move(ham_cursor_t *hcursor, ham_key_t *key, + ham_record_t *record, uint32_t flags) +{ + Database *db; + Environment *env; + + if (!hcursor) { + 
ham_trace(("parameter 'cursor' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + Cursor *cursor = (Cursor *)hcursor; + + db = cursor->get_db(); + + ScopedLock lock(db->get_env()->mutex()); + + if ((flags & HAM_ONLY_DUPLICATES) && (flags & HAM_SKIP_DUPLICATES)) { + ham_trace(("combination of HAM_ONLY_DUPLICATES and " + "HAM_SKIP_DUPLICATES not allowed")); + return (db->set_error(HAM_INV_PARAMETER)); + } + + env = db->get_env(); + + if ((flags & HAM_DIRECT_ACCESS) + && !(env->get_flags() & HAM_IN_MEMORY)) { + ham_trace(("flag HAM_DIRECT_ACCESS is only allowed in " + "In-Memory Databases")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if ((flags & HAM_DIRECT_ACCESS) + && (env->get_flags() & HAM_ENABLE_TRANSACTIONS)) { + ham_trace(("flag HAM_DIRECT_ACCESS is not allowed in " + "combination with Transactions")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if ((flags & HAM_PARTIAL) + && (db->get_flags() & HAM_ENABLE_TRANSACTIONS)) { + ham_trace(("flag HAM_PARTIAL is not allowed in combination with " + "transactions")); + return (db->set_error(HAM_INV_PARAMETER)); + } + + if (key && !__prepare_key(key)) + return (db->set_error(HAM_INV_PARAMETER)); + if (record && !__prepare_record(record)) + return (db->set_error(HAM_INV_PARAMETER)); + + return (db->set_error(db->cursor_move(cursor, key, record, flags))); +} + +HAM_EXPORT ham_status_t HAM_CALLCONV +ham_cursor_find(ham_cursor_t *hcursor, ham_key_t *key, ham_record_t *record, + uint32_t flags) +{ + Database *db; + Environment *env; + + if (!hcursor) { + ham_trace(("parameter 'cursor' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + Cursor *cursor = (Cursor *)hcursor; + + db = cursor->get_db(); + env = db->get_env(); + + ScopedLock lock; + if (!(flags & HAM_DONT_LOCK)) + lock = ScopedLock(env->mutex()); + + if (!key) { + ham_trace(("parameter 'key' must not be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if ((flags & HAM_DIRECT_ACCESS) + && !(env->get_flags() & HAM_IN_MEMORY)) 
{ + ham_trace(("flag HAM_DIRECT_ACCESS is only allowed in " + "In-Memory Databases")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if ((flags & HAM_DIRECT_ACCESS) + && (env->get_flags() & HAM_ENABLE_TRANSACTIONS)) { + ham_trace(("flag HAM_DIRECT_ACCESS is not allowed in " + "combination with Transactions")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if (flags & HAM_HINT_PREPEND) { + ham_trace(("flag HAM_HINT_PREPEND is only allowed in " + "ham_cursor_insert")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if (flags & HAM_HINT_APPEND) { + ham_trace(("flag HAM_HINT_APPEND is only allowed in " + "ham_cursor_insert")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if ((flags & HAM_PARTIAL) + && (db->get_flags() & HAM_ENABLE_TRANSACTIONS)) { + ham_trace(("flag HAM_PARTIAL is not allowed in combination with " + "transactions")); + return (db->set_error(HAM_INV_PARAMETER)); + } + + if (key && !__prepare_key(key)) + return (db->set_error(HAM_INV_PARAMETER)); + if (record && !__prepare_record(record)) + return (db->set_error(HAM_INV_PARAMETER)); + + return (db->set_error(db->find(cursor, cursor->get_txn(), + key, record, flags))); +} + +ham_status_t HAM_CALLCONV +ham_cursor_insert(ham_cursor_t *hcursor, ham_key_t *key, ham_record_t *record, + uint32_t flags) +{ + Database *db; + + if (!hcursor) { + ham_trace(("parameter 'cursor' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + Cursor *cursor = (Cursor *)hcursor; + + db = cursor->get_db(); + + ScopedLock lock(db->get_env()->mutex()); + + if (!key) { + ham_trace(("parameter 'key' must not be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if (!record) { + ham_trace(("parameter 'record' must not be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if ((flags&HAM_HINT_APPEND) && (flags&HAM_HINT_PREPEND)) { + ham_trace(("flags HAM_HINT_APPEND and HAM_HINT_PREPEND " + "are mutually exclusive")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if (!__prepare_key(key) || 
!__prepare_record(record)) + return (db->set_error(HAM_INV_PARAMETER)); + + if (db->get_flags() & HAM_READ_ONLY) { + ham_trace(("cannot insert to a read-only database")); + return (db->set_error(HAM_WRITE_PROTECTED)); + } + if ((flags & HAM_DUPLICATE) && (flags & HAM_OVERWRITE)) { + ham_trace(("cannot combine HAM_DUPLICATE and HAM_OVERWRITE")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if ((flags & HAM_DUPLICATE) + && !(db->get_flags() & HAM_ENABLE_DUPLICATE_KEYS)) { + ham_trace(("database does not support duplicate keys " + "(see HAM_ENABLE_DUPLICATE_KEYS)")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if ((flags & HAM_PARTIAL) + && (db->get_flags() & HAM_ENABLE_TRANSACTIONS)) { + ham_trace(("flag HAM_PARTIAL is not allowed in combination with " + "transactions")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if ((flags&HAM_PARTIAL) + && (record->partial_size + record->partial_offset > record->size)) { + ham_trace(("partial offset+size is greater than the total " + "record size")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if ((flags & HAM_PARTIAL) && (record->size <= sizeof(uint64_t))) { + ham_trace(("flag HAM_PARTIAL is not allowed if record->size <= 8")); + return (db->set_error(HAM_INV_PARAMETER)); + } + + /* + * set flag HAM_DUPLICATE if one of DUPLICATE_INSERT* is set, but do + * not allow these flags if duplicate sorting is enabled + */ + if (flags & (HAM_DUPLICATE_INSERT_AFTER + | HAM_DUPLICATE_INSERT_BEFORE + | HAM_DUPLICATE_INSERT_LAST + | HAM_DUPLICATE_INSERT_FIRST)) { + flags |= HAM_DUPLICATE; + } + + /* allocate temp. 
storage for a recno key */ + if ((db->get_flags() & HAM_RECORD_NUMBER32) + || (db->get_flags() & HAM_RECORD_NUMBER64)) { + if (flags & HAM_OVERWRITE) { + if (!key->data) { + ham_trace(("key->data must not be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + } + else { + if (key->flags & HAM_KEY_USER_ALLOC) { + if (!key->data) { + ham_trace(("key->data must not be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + } + else { + if (key->data || key->size) { + ham_trace(("key->size must be 0, key->data must be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + } + } + } + + return (db->set_error(db->insert(cursor, cursor->get_txn(), key, + record, flags))); +} + +ham_status_t HAM_CALLCONV +ham_cursor_erase(ham_cursor_t *hcursor, uint32_t flags) +{ + Database *db; + + if (!hcursor) { + ham_trace(("parameter 'cursor' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + Cursor *cursor = (Cursor *)hcursor; + + db = cursor->get_db(); + + ScopedLock lock(db->get_env()->mutex()); + + if (db->get_flags() & HAM_READ_ONLY) { + ham_trace(("cannot erase from a read-only database")); + return (db->set_error(HAM_WRITE_PROTECTED)); + } + if (flags & HAM_HINT_PREPEND) { + ham_trace(("flags HAM_HINT_PREPEND only allowed in ham_cursor_insert")); + return (db->set_error(HAM_INV_PARAMETER)); + } + if (flags & HAM_HINT_APPEND) { + ham_trace(("flags HAM_HINT_APPEND only allowed in ham_cursor_insert")); + return (db->set_error(HAM_INV_PARAMETER)); + } + + return (db->set_error(db->erase(cursor, cursor->get_txn(), 0, flags))); +} + +ham_status_t HAM_CALLCONV +ham_cursor_get_duplicate_count(ham_cursor_t *hcursor, uint32_t *count, + uint32_t flags) +{ + Database *db; + + if (!hcursor) { + ham_trace(("parameter 'cursor' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + Cursor *cursor = (Cursor *)hcursor; + + db = cursor->get_db(); + + ScopedLock lock(db->get_env()->mutex()); + + if (!count) { + ham_trace(("parameter 'count' must not be NULL")); + 
return (db->set_error(HAM_INV_PARAMETER)); + } + + return (db->set_error(db->cursor_get_record_count(cursor, flags, count))); +} + +ham_status_t HAM_CALLCONV +ham_cursor_get_duplicate_position(ham_cursor_t *hcursor, uint32_t *position) +{ + Database *db; + + if (!hcursor) { + ham_trace(("parameter 'cursor' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + Cursor *cursor = (Cursor *)hcursor; + + db = cursor->get_db(); + + ScopedLock lock(db->get_env()->mutex()); + + if (!position) { + ham_trace(("parameter 'position' must not be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + + return (db->set_error(db->cursor_get_duplicate_position(cursor, position))); +} + +ham_status_t HAM_CALLCONV +ham_cursor_get_record_size(ham_cursor_t *hcursor, uint64_t *size) +{ + Database *db; + + if (!hcursor) { + ham_trace(("parameter 'cursor' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + Cursor *cursor = (Cursor *)hcursor; + + db = cursor->get_db(); + + ScopedLock lock(db->get_env()->mutex()); + + if (!size) { + ham_trace(("parameter 'size' must not be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + + return (db->set_error(db->cursor_get_record_size(cursor, size))); +} + +ham_status_t HAM_CALLCONV +ham_cursor_close(ham_cursor_t *hcursor) +{ + Database *db; + + if (!hcursor) { + ham_trace(("parameter 'cursor' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + Cursor *cursor = (Cursor *)hcursor; + + db = cursor->get_db(); + + ScopedLock lock(db->get_env()->mutex()); + + return (db->set_error(db->cursor_close(cursor))); +} + +void HAM_CALLCONV +ham_set_context_data(ham_db_t *hdb, void *data) +{ + Database *db = (Database *)hdb; + + if (!db) + return; + + ScopedLock lock(db->get_env()->mutex()); + db->set_context_data(data); +} + +void * HAM_CALLCONV +ham_get_context_data(ham_db_t *hdb, ham_bool_t dont_lock) +{ + Database *db = (Database *)hdb; + if (!db) + return (0); + + if (dont_lock) + return (db->get_context_data()); + + ScopedLock 
lock(db->get_env()->mutex()); + return (db->get_context_data()); +} + +ham_db_t * HAM_CALLCONV +ham_cursor_get_database(ham_cursor_t *hcursor) +{ + if (hcursor) { + Cursor *cursor = (Cursor *)hcursor; + return ((ham_db_t *)cursor->get_db()); + } + return (0); +} + +ham_env_t * HAM_CALLCONV +ham_db_get_env(ham_db_t *hdb) +{ + Database *db = (Database *)hdb; + if (!db) + return (0); + + return ((ham_env_t *)db->get_env()); +} + +ham_status_t HAM_CALLCONV +ham_db_get_key_count(ham_db_t *hdb, ham_txn_t *htxn, uint32_t flags, + uint64_t *keycount) +{ + Database *db = (Database *)hdb; + Transaction *txn = (Transaction *)htxn; + + if (!db) { + ham_trace(("parameter 'db' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (flags & ~(HAM_SKIP_DUPLICATES)) { + ham_trace(("parameter 'flag' contains unsupported flag bits: %08x", + flags & (~HAM_SKIP_DUPLICATES))); + return (HAM_INV_PARAMETER); + } + if (!keycount) { + ham_trace(("parameter 'keycount' must not be NULL")); + return (db->set_error(HAM_INV_PARAMETER)); + } + + ScopedLock lock(db->get_env()->mutex()); + + return (db->set_error(db->count(txn, (flags & HAM_SKIP_DUPLICATES) != 0, + keycount))); +} + +void HAM_CALLCONV +ham_set_errhandler(ham_errhandler_fun f) +{ + if (f) + hamsterdb::Globals::ms_error_handler = f; + else + hamsterdb::Globals::ms_error_handler = hamsterdb::default_errhandler; +} + +ham_status_t HAM_CALLCONV +ham_env_get_metrics(ham_env_t *henv, ham_env_metrics_t *metrics) +{ + Environment *env = (Environment *)henv; + if (!env) { + ham_trace(("parameter 'env' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!metrics) { + ham_trace(("parameter 'metrics' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + memset(metrics, 0, sizeof(ham_env_metrics_t)); + metrics->version = HAM_METRICS_VERSION; + + // fill in memory metrics + Memory::get_global_metrics(metrics); + // ... 
and everything else + return (env->fill_metrics(metrics)); +} + +ham_bool_t HAM_CALLCONV +ham_is_debug() +{ +#ifdef HAM_DEBUG + return (HAM_TRUE); +#else + return (HAM_FALSE); +#endif +} + +ham_bool_t HAM_CALLCONV +ham_is_pro() +{ + return (HAM_FALSE); +} + +uint32_t HAM_CALLCONV +ham_is_pro_evaluation() +{ + return (0); +} diff --git a/plugins/Dbx_kv/src/hamsterdb/src/5hamsterdb/hola.cc b/plugins/Dbx_kv/src/hamsterdb/src/5hamsterdb/hola.cc new file mode 100644 index 0000000000..a5a56a1814 --- /dev/null +++ b/plugins/Dbx_kv/src/hamsterdb/src/5hamsterdb/hola.cc @@ -0,0 +1,704 @@ +/* + * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "0root/root.h" + +#include "ham/hamsterdb_ola.h" + +// Always verify that a file of level N does not include headers > N! 
+#include "1base/error.h" +#include "3btree/btree_visitor.h" +#include "4db/db.h" +#include "4db/db_local.h" + +#ifndef HAM_ROOT_H +# error "root.h was not included" +#endif + +using namespace hamsterdb; + +ham_status_t HAM_CALLCONV +hola_count(ham_db_t *hdb, ham_txn_t *htxn, hola_result_t *result) +{ + if (!hdb) { + ham_trace(("parameter 'db' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!result) { + ham_trace(("parameter 'result' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + Database *db = (Database *)hdb; + Transaction *txn = (Transaction *)htxn; + + result->type = HAM_TYPE_UINT64; + result->u.result_u64 = 0; + + ScopedLock lock(db->get_env()->mutex()); + return (db->set_error(db->count(txn, false, &result->u.result_u64))); +} + +// +// A ScanVisitor for hola_count_if +// +template<typename PodType> +struct CountIfScanVisitor : public ScanVisitor { + CountIfScanVisitor(hola_bool_predicate_t *pred) + : m_count(0), m_pred(pred) { + } + + // Operates on a single key + virtual void operator()(const void *key_data, uint16_t key_size, + size_t duplicate_count) { + if (m_pred->predicate_func(key_data, key_size, m_pred->context)) + m_count++; + } + + // Operates on an array of keys + virtual void operator()(const void *key_array, size_t key_count) { + const PodType *p = (const PodType *)key_array; + const PodType *end = &p[key_count]; + for (; p < end; p++) { + if (m_pred->predicate_func(p, sizeof(PodType), m_pred->context)) + m_count++; + } + } + + // Assigns the result to |result| + virtual void assign_result(hola_result_t *result) { + memcpy(&result->u.result_u64, &m_count, sizeof(uint64_t)); + } + + // The counter + uint64_t m_count; + + // The user's predicate + hola_bool_predicate_t *m_pred; +}; + +// +// A ScanVisitor for hola_count_if on binary keys +// +struct CountIfScanVisitorBinary : public ScanVisitor { + CountIfScanVisitorBinary(size_t key_size, hola_bool_predicate_t *pred) + : m_count(0), m_key_size(key_size), m_pred(pred) { + 
} + + // Operates on a single key + virtual void operator()(const void *key_data, uint16_t key_size, + size_t duplicate_count) { + if (m_pred->predicate_func(key_data, key_size, m_pred->context)) + m_count++; + } + + // Operates on an array of keys + virtual void operator()(const void *key_array, size_t key_count) { + assert(m_key_size != HAM_KEY_SIZE_UNLIMITED); + const uint8_t *p = (const uint8_t *)key_array; + const uint8_t *end = &p[key_count * m_key_size]; + for (; p < end; p += m_key_size) { + if (m_pred->predicate_func(p, m_key_size, m_pred->context)) + m_count++; + } + } + + // Assigns the result to |result| + virtual void assign_result(hola_result_t *result) { + memcpy(&result->u.result_u64, &m_count, sizeof(uint64_t)); + } + + // The counter + uint64_t m_count; + + // The key size + uint16_t m_key_size; + + // The user's predicate + hola_bool_predicate_t *m_pred; +}; + +ham_status_t HAM_CALLCONV +hola_count_if(ham_db_t *hdb, ham_txn_t *txn, hola_bool_predicate_t *pred, + hola_result_t *result) +{ + if (!hdb) { + ham_trace(("parameter 'db' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!pred) { + ham_trace(("parameter 'pred' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!result) { + ham_trace(("parameter 'result' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + // Remote databases are not yet supported + LocalDatabase *db = dynamic_cast<LocalDatabase *>((Database *)hdb); + if (!db) { + ham_trace(("hola_* functions are not yet supported for remote databases")); + return (HAM_INV_PARAMETER); + } + + std::auto_ptr<ScanVisitor> visitor; + result->u.result_u64 = 0; + result->type = HAM_TYPE_UINT64; + + switch (db->config().key_type) { + case HAM_TYPE_UINT8: + visitor.reset(new CountIfScanVisitor<uint8_t>(pred)); + break; + case HAM_TYPE_UINT16: + visitor.reset(new CountIfScanVisitor<uint16_t>(pred)); + break; + case HAM_TYPE_UINT32: + visitor.reset(new CountIfScanVisitor<uint32_t>(pred)); + break; + case HAM_TYPE_UINT64: + 
visitor.reset(new CountIfScanVisitor<uint64_t>(pred)); + break; + case HAM_TYPE_REAL32: + visitor.reset(new CountIfScanVisitor<float>(pred)); + break; + case HAM_TYPE_REAL64: + visitor.reset(new CountIfScanVisitor<double>(pred)); + break; + case HAM_TYPE_BINARY: + visitor.reset(new CountIfScanVisitorBinary(db->config().key_size, + pred)); + break; + default: + ham_assert(!"shouldn't be here"); + return (HAM_INV_PARAMETER); + } + + ScopedLock lock(db->get_env()->mutex()); + ham_status_t st = db->scan((Transaction *)txn, visitor.get(), false); + if (st == 0) + visitor->assign_result(result); + return (db->set_error(st)); +} + +ham_status_t HAM_CALLCONV +hola_count_distinct(ham_db_t *hdb, ham_txn_t *htxn, hola_result_t *result) +{ + if (!hdb) { + ham_trace(("parameter 'db' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!result) { + ham_trace(("parameter 'result' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + Database *db = (Database *)hdb; + Transaction *txn = (Transaction *)htxn; + + result->type = HAM_TYPE_UINT64; + result->u.result_u64 = 0; + + ScopedLock lock(db->get_env()->mutex()); + return (db->set_error(db->count(txn, true, &result->u.result_u64))); +} + +ham_status_t HAM_CALLCONV +hola_count_distinct_if(ham_db_t *hdb, ham_txn_t *txn, + hola_bool_predicate_t *pred, hola_result_t *result) +{ + if (!hdb) { + ham_trace(("parameter 'db' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!pred) { + ham_trace(("parameter 'pred' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!result) { + ham_trace(("parameter 'result' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + // Remote databases are not yet supported + LocalDatabase *db = dynamic_cast<LocalDatabase *>((Database *)hdb); + if (!db) { + ham_trace(("hola_* functions are not yet supported for remote databases")); + return (HAM_INV_PARAMETER); + } + + std::auto_ptr<ScanVisitor> visitor; + result->u.result_u64 = 0; + result->type = HAM_TYPE_UINT64; + + 
switch (db->config().key_type) { + case HAM_TYPE_UINT8: + visitor.reset(new CountIfScanVisitor<uint8_t>(pred)); + break; + case HAM_TYPE_UINT16: + visitor.reset(new CountIfScanVisitor<uint16_t>(pred)); + break; + case HAM_TYPE_UINT32: + visitor.reset(new CountIfScanVisitor<uint32_t>(pred)); + break; + case HAM_TYPE_UINT64: + visitor.reset(new CountIfScanVisitor<uint64_t>(pred)); + break; + case HAM_TYPE_REAL32: + visitor.reset(new CountIfScanVisitor<float>(pred)); + break; + case HAM_TYPE_REAL64: + visitor.reset(new CountIfScanVisitor<double>(pred)); + break; + case HAM_TYPE_BINARY: + visitor.reset(new CountIfScanVisitorBinary(db->config().key_size, + pred)); + break; + default: + ham_assert(!"shouldn't be here"); + return (HAM_INV_PARAMETER); + } + + ScopedLock lock(db->get_env()->mutex()); + ham_status_t st = db->scan((Transaction *)txn, visitor.get(), true); + if (st == 0) + visitor->assign_result(result); + return (db->set_error(st)); +} + +// +// A ScanVisitor for hola_average +// +template<typename PodType, typename ResultType> +struct AverageScanVisitor : public ScanVisitor { + AverageScanVisitor() + : m_sum(0), m_count(0) { + } + + // Operates on a single key + virtual void operator()(const void *key_data, uint16_t key_size, + size_t duplicate_count) { + ham_assert(key_size == sizeof(PodType)); + + m_sum += *(const PodType *)key_data * duplicate_count; + m_count++; + } + + // Operates on an array of keys + virtual void operator()(const void *key_array, size_t key_count) { + const PodType *p = (const PodType *)key_array; + const PodType *end = &p[key_count]; + for (; p < end; p++) + m_sum += *p; + m_count += key_count; + } + + // Assigns the result to |result| + virtual void assign_result(hola_result_t *result) { + ResultType res = m_sum / m_count; + memcpy(&result->u.result_u64, &res, sizeof(uint64_t)); + } + + // The sum of all keys + ResultType m_sum; + + // For counting the keys + uint64_t m_count; +}; + +ham_status_t HAM_CALLCONV +hola_average(ham_db_t 
*hdb, ham_txn_t *txn, hola_result_t *result) +{ + if (!hdb) { + ham_trace(("parameter 'db' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!result) { + ham_trace(("parameter 'result' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + // Remote databases are not yet supported + LocalDatabase *db = dynamic_cast<LocalDatabase *>((Database *)hdb); + if (!db) { + ham_trace(("hola_* functions are not yet supported for remote databases")); + return (HAM_INV_PARAMETER); + } + + std::auto_ptr<ScanVisitor> visitor; + result->u.result_u64 = 0; + + switch (db->config().key_type) { + case HAM_TYPE_UINT8: + result->type = HAM_TYPE_UINT64; + visitor.reset(new AverageScanVisitor<uint8_t, uint64_t>()); + break; + case HAM_TYPE_UINT16: + result->type = HAM_TYPE_UINT64; + visitor.reset(new AverageScanVisitor<uint16_t, uint64_t>()); + break; + case HAM_TYPE_UINT32: + result->type = HAM_TYPE_UINT64; + visitor.reset(new AverageScanVisitor<uint32_t, uint64_t>()); + break; + case HAM_TYPE_UINT64: + result->type = HAM_TYPE_UINT64; + visitor.reset(new AverageScanVisitor<uint64_t, uint64_t>()); + break; + case HAM_TYPE_REAL32: + result->type = HAM_TYPE_REAL64; + visitor.reset(new AverageScanVisitor<float, double>()); + break; + case HAM_TYPE_REAL64: + result->type = HAM_TYPE_REAL64; + visitor.reset(new AverageScanVisitor<double, double>()); + break; + default: + ham_trace(("hola_avg* can only be applied to numerical data")); + return (HAM_INV_PARAMETER); + } + + ScopedLock lock(db->get_env()->mutex()); + ham_status_t st = db->scan((Transaction *)txn, visitor.get(), false); + if (st == 0) + visitor->assign_result(result); + return (db->set_error(st)); +} + +// +// A ScanVisitor for hola_average_if +// +template<typename PodType, typename ResultType> +struct AverageIfScanVisitor : public ScanVisitor { + AverageIfScanVisitor(hola_bool_predicate_t *pred) + : m_sum(0), m_count(0), m_pred(pred) { + } + + // Operates on a single key + virtual void operator()(const void 
*key_data, uint16_t key_size, + size_t duplicate_count) { + ham_assert(key_size == sizeof(PodType)); + + if (m_pred->predicate_func(key_data, key_size, m_pred->context)) { + m_sum += *(const PodType *)key_data * duplicate_count; + m_count++; + } + } + + // Operates on an array of keys + virtual void operator()(const void *key_array, size_t key_count) { + const PodType *p = (const PodType *)key_array; + const PodType *end = &p[key_count]; + for (; p < end; p++) { + if (m_pred->predicate_func(p, sizeof(PodType), m_pred->context)) { + m_sum += *p; + m_count++; + } + } + } + + // Assigns the result to |result| + virtual void assign_result(hola_result_t *result) { + ResultType res = m_sum / m_count; + memcpy(&result->u.result_u64, &res, sizeof(uint64_t)); + } + + // The sum of all keys + ResultType m_sum; + + // For counting the keys + uint64_t m_count; + + // The user's predicate function + hola_bool_predicate_t *m_pred; +}; + +ham_status_t HAM_CALLCONV +hola_average_if(ham_db_t *hdb, ham_txn_t *txn, hola_bool_predicate_t *pred, + hola_result_t *result) +{ + if (!hdb) { + ham_trace(("parameter 'db' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!pred) { + ham_trace(("parameter 'pred' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!result) { + ham_trace(("parameter 'result' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + // Remote databases are not yet supported + LocalDatabase *db = dynamic_cast<LocalDatabase *>((Database *)hdb); + if (!db) { + ham_trace(("hola_* functions are not yet supported for remote databases")); + return (HAM_INV_PARAMETER); + } + + std::auto_ptr<ScanVisitor> visitor; + result->u.result_u64 = 0; + + switch (db->config().key_type) { + case HAM_TYPE_UINT8: + result->type = HAM_TYPE_UINT64; + visitor.reset(new AverageIfScanVisitor<uint8_t, uint64_t>(pred)); + break; + case HAM_TYPE_UINT16: + result->type = HAM_TYPE_UINT64; + visitor.reset(new AverageIfScanVisitor<uint16_t, uint64_t>(pred)); + break; + case 
HAM_TYPE_UINT32: + result->type = HAM_TYPE_UINT64; + visitor.reset(new AverageIfScanVisitor<uint32_t, uint64_t>(pred)); + break; + case HAM_TYPE_UINT64: + result->type = HAM_TYPE_UINT64; + visitor.reset(new AverageIfScanVisitor<uint64_t, uint64_t>(pred)); + break; + case HAM_TYPE_REAL32: + result->type = HAM_TYPE_REAL64; + visitor.reset(new AverageIfScanVisitor<float, double>(pred)); + break; + case HAM_TYPE_REAL64: + result->type = HAM_TYPE_REAL64; + visitor.reset(new AverageIfScanVisitor<double, double>(pred)); + break; + default: + ham_trace(("hola_avg* can only be applied to numerical data")); + return (HAM_INV_PARAMETER); + } + + ScopedLock lock(db->get_env()->mutex()); + ham_status_t st = db->scan((Transaction *)txn, visitor.get(), false); + if (st == 0) + visitor->assign_result(result); + return (db->set_error(st)); +} + +// +// A ScanVisitor for hola_sum +// +template<typename PodType, typename ResultType> +struct SumScanVisitor : public ScanVisitor { + SumScanVisitor() + : m_sum(0) { + } + + // Operates on a single key + virtual void operator()(const void *key_data, uint16_t key_size, + size_t duplicate_count) { + ham_assert(key_size == sizeof(PodType)); + m_sum += *(const PodType *)key_data * duplicate_count; + } + + // Operates on an array of keys + virtual void operator()(const void *key_array, size_t key_count) { + const PodType *p = (const PodType *)key_array; + const PodType *end = &p[key_count]; + const int kMax = 8; + ResultType sums[kMax] = {0}; + for (; p + kMax < end; p += kMax) { +#if defined __GNUC__ + __builtin_prefetch(((char *)p) + kMax * sizeof(PodType)); +#endif + sums[0] += p[0]; + sums[1] += p[1]; + sums[2] += p[2]; + sums[3] += p[3]; + sums[4] += p[4]; + sums[5] += p[5]; + sums[6] += p[6]; + sums[7] += p[7]; + } + for (; p < end; p++) + m_sum += *p; + for (int i = 0; i < kMax; i++) + m_sum += sums[i]; + } + + // Assigns the result to |result| + virtual void assign_result(hola_result_t *result) { + memcpy(&result->u.result_u64, &m_sum, 
sizeof(uint64_t)); + } + + // The sum of all keys + ResultType m_sum; +}; + +ham_status_t HAM_CALLCONV +hola_sum(ham_db_t *hdb, ham_txn_t *txn, hola_result_t *result) +{ + if (!hdb) { + ham_trace(("parameter 'hdb' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!result) { + ham_trace(("parameter 'result' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + std::auto_ptr<ScanVisitor> visitor; + result->u.result_u64 = 0; + + // Remote databases are not yet supported + LocalDatabase *db = dynamic_cast<LocalDatabase *>((Database *)hdb); + if (!db) { + ham_trace(("hola_* functions are not yet supported for remote databases")); + return (HAM_INV_PARAMETER); + } + + switch (db->config().key_type) { + case HAM_TYPE_UINT8: + result->type = HAM_TYPE_UINT64; + visitor.reset(new SumScanVisitor<uint8_t, uint64_t>()); + break; + case HAM_TYPE_UINT16: + result->type = HAM_TYPE_UINT64; + visitor.reset(new SumScanVisitor<uint16_t, uint64_t>()); + break; + case HAM_TYPE_UINT32: + result->type = HAM_TYPE_UINT64; + visitor.reset(new SumScanVisitor<uint32_t, uint64_t>()); + break; + case HAM_TYPE_UINT64: + result->type = HAM_TYPE_UINT64; + visitor.reset(new SumScanVisitor<uint64_t, uint64_t>()); + break; + case HAM_TYPE_REAL32: + result->type = HAM_TYPE_REAL64; + visitor.reset(new SumScanVisitor<float, double>()); + break; + case HAM_TYPE_REAL64: + result->type = HAM_TYPE_REAL64; + visitor.reset(new SumScanVisitor<double, double>()); + break; + default: + ham_trace(("hola_sum* can only be applied to numerical data")); + return (HAM_INV_PARAMETER); + } + + ScopedLock lock(db->get_env()->mutex()); + ham_status_t st = db->scan((Transaction *)txn, visitor.get(), false); + if (st == 0) + visitor->assign_result(result); + return (db->set_error(st)); +} + +// +// A ScanVisitor for hola_sum_if +// +template<typename PodType, typename ResultType> +struct SumIfScanVisitor : public ScanVisitor { + SumIfScanVisitor(hola_bool_predicate_t *pred) + : m_sum(0), m_pred(pred) { + } + 
+ // Operates on a single key + virtual void operator()(const void *key_data, uint16_t key_size, + size_t duplicate_count) { + ham_assert(key_size == sizeof(PodType)); + + if (m_pred->predicate_func(key_data, key_size, m_pred->context)) + m_sum += *(const PodType *)key_data * duplicate_count; + } + + // Operates on an array of keys + virtual void operator()(const void *key_array, size_t key_count) { + const PodType *p = (const PodType *)key_array; + const PodType *end = &p[key_count]; + for (; p < end; p++) { + if (m_pred->predicate_func(p, sizeof(PodType), m_pred->context)) + m_sum += *p; + } + } + + // Assigns the result to |result| + virtual void assign_result(hola_result_t *result) { + memcpy(&result->u.result_u64, &m_sum, sizeof(uint64_t)); + } + + // The sum of all keys + ResultType m_sum; + + // The user's predicate function + hola_bool_predicate_t *m_pred; +}; + +ham_status_t HAM_CALLCONV +hola_sum_if(ham_db_t *hdb, ham_txn_t *txn, hola_bool_predicate_t *pred, + hola_result_t *result) +{ + if (!hdb) { + ham_trace(("parameter 'db' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!pred) { + ham_trace(("parameter 'pred' must not be NULL")); + return (HAM_INV_PARAMETER); + } + if (!result) { + ham_trace(("parameter 'result' must not be NULL")); + return (HAM_INV_PARAMETER); + } + + // Remote databases are not yet supported + LocalDatabase *db = dynamic_cast<LocalDatabase *>((Database *)hdb); + if (!db) { + ham_trace(("hola_* functions are not yet supported for remote databases")); + return (HAM_INV_PARAMETER); + } + + std::auto_ptr<ScanVisitor> visitor; + result->u.result_u64 = 0; + + switch (db->config().key_type) { + case HAM_TYPE_UINT8: + result->type = HAM_TYPE_UINT64; + visitor.reset(new SumIfScanVisitor<uint8_t, uint64_t>(pred)); + break; + case HAM_TYPE_UINT16: + result->type = HAM_TYPE_UINT64; + visitor.reset(new SumIfScanVisitor<uint16_t, uint64_t>(pred)); + break; + case HAM_TYPE_UINT32: + result->type = HAM_TYPE_UINT64; + 
visitor.reset(new SumIfScanVisitor<uint32_t, uint64_t>(pred)); + break; + case HAM_TYPE_UINT64: + result->type = HAM_TYPE_UINT64; + visitor.reset(new SumIfScanVisitor<uint64_t, uint64_t>(pred)); + break; + case HAM_TYPE_REAL32: + result->type = HAM_TYPE_REAL64; + visitor.reset(new SumIfScanVisitor<float, double>(pred)); + break; + case HAM_TYPE_REAL64: + result->type = HAM_TYPE_REAL64; + visitor.reset(new SumIfScanVisitor<double, double>(pred)); + break; + default: + ham_trace(("hola_sum* can only be applied to numerical data")); + return (HAM_INV_PARAMETER); + } + + ScopedLock lock(db->get_env()->mutex()); + ham_status_t st = db->scan((Transaction *)txn, visitor.get(), false); + if (st == 0) + visitor->assign_result(result); + return (db->set_error(st)); +} |