diff --git a/Cargo.lock b/Cargo.lock index ad7022e..8ce04bf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,63 +4,64 @@ version = 3 [[package]] name = "anstream" -version = "0.6.13" +version = "0.6.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" +checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", + "is_terminal_polyfill", "utf8parse", ] [[package]] name = "anstyle" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" +checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" [[package]] name = "anstyle-parse" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" +checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.0.2" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" +checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391" dependencies = [ - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "anstyle-wincon" -version = "3.0.2" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" +checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" dependencies = [ "anstyle", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "anyhow" -version = "1.0.82" +version = "1.0.86" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f538837af36e6f6a9be0faa67f9a314f8119e4e4b5867c6ab40ed60360142519" +checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" [[package]] name = "autocfg" -version = "1.1.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" [[package]] name = "base64" @@ -79,24 +80,19 @@ dependencies = [ [[package]] name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "bitflags" -version = "2.4.2" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" +checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" [[package]] name = "cc" -version = "1.0.82" +version = "1.0.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "305fe645edc1442a0fa8b6726ba61d422798d37a52e12eaecf4b022ebbb88f01" +checksum = "96c51067fd44124faa7f870b4b1c969379ad32b2ba805aa959430ceaa384f695" dependencies = [ "jobserver", "libc", + "once_cell", ] [[package]] @@ -107,9 +103,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.5.4" +version = "4.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0" +checksum = "a9689a29b593160de5bc4aacab7b5d54fb52231de70122626c178e6a368994c7" dependencies = [ "clap_builder", "clap_derive", @@ -117,9 +113,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.2" +version = "4.5.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" +checksum = "2e5387378c84f6faa26890ebf9f0a92989f8873d4d380467bcd0d8d8620424df" dependencies = [ "anstream", "anstyle", @@ -129,11 +125,11 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.4" +version = "4.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "528131438037fd55894f62d6e9f068b8f45ac57ffa77517819645d10aed04f64" +checksum = "c780290ccf4fb26629baa7a1081e68ced113f1d3ec302fa5948f1c381ebf06c6" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "syn 2.0.66", @@ -141,15 +137,15 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.0" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" +checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" [[package]] name = "colorchoice" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" [[package]] name = "concurrent-map" @@ -163,9 +159,9 @@ dependencies = [ [[package]] name = "crc32fast" -version = "1.3.2" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" dependencies = [ "cfg-if", ] @@ -179,21 +175,27 @@ dependencies = [ "shared-local-state", ] +[[package]] +name = "either" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" + [[package]] name = "errno" -version = "0.3.8" 
+version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "fastrand" -version = "2.0.1" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" [[package]] name = "fault-injection" @@ -211,6 +213,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "gapped_array" +version = "0.1.0" +dependencies = [ + "itertools", + "kdam", + "serde", + "slice_search", +] + [[package]] name = "generational-arena" version = "0.2.9" @@ -222,21 +234,15 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.10" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", "wasi", ] -[[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - [[package]] name = "heck" version = "0.5.0" @@ -256,6 +262,21 @@ dependencies = [ "serde", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] 
name = "itoa" version = "1.0.11" @@ -264,30 +285,50 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jobserver" -version = "0.1.28" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab46a6e9526ddef3ae7f787c06f0f2600639ba80ea3eade3d8e670a2230f51d6" +checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" dependencies = [ "libc", ] +[[package]] +name = "kdam" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "526586ea01a9a132b5f8d3a60f6d6b41b411550236f5ee057795f20b37316957" +dependencies = [ + "terminal_size", + "windows-sys 0.52.0", +] + [[package]] name = "lazy_static" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "learned_index_segmentation" +version = "0.1.0" +dependencies = [ + "num", + "rand", + "rand_distr", + "serde", +] + [[package]] name = "libc" -version = "0.2.153" +version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" [[package]] name = "libm" -version = "0.2.7" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "limousine_bench" @@ -308,9 +349,11 @@ version = "0.3.4" dependencies = [ "anyhow", "bincode", + "gapped_array", "generational-arena", "id_allocator", "lazy_static", + "learned_index_segmentation", "marble", "num", "serde", @@ -344,6 +387,7 @@ name = "limousine_tests" version = "0.1.0" dependencies = [ "limousine_engine", + "num", "rand", "rand_distr", 
"tempfile", @@ -351,15 +395,15 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "lock_api" -version = "0.4.10" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ "autocfg", "scopeguard", @@ -466,6 +510,12 @@ dependencies = [ "libm", ] +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" + [[package]] name = "pagetable" version = "0.4.5" @@ -474,9 +524,9 @@ checksum = "92a516a35619e87f5c17e7a5dd0e0313aa01aecbd39b3c650b22a4500d74f6e0" [[package]] name = "parking_lot" -version = "0.12.1" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" dependencies = [ "lock_api", "parking_lot_core", @@ -484,22 +534,22 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.8" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", "redox_syscall", "smallvec", - "windows-targets 0.48.1", + "windows-targets 0.52.5", ] [[package]] name = "pkg-config" -version = "0.3.27" +version = "0.3.30" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" [[package]] name = "ppv-lite86" @@ -509,9 +559,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.84" +version = "1.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec96c6a92621310b51366f1e28d05ef11489516e93be030060e5fc12024a49d6" +checksum = "22244ce15aa966053a896d1accb3a6e68469b97c7f33f284b99f0d576879fc23" dependencies = [ "unicode-ident", ] @@ -567,37 +617,37 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.3.5" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +checksum = "469052894dcb553421e483e4209ee581a45100d31b4018de03e5a7ad86374a7e" dependencies = [ - "bitflags 1.3.2", + "bitflags", ] [[package]] name = "rustix" -version = "0.38.32" +version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65e04861e65f21776e67888bfbea442b3642beaa0138fdb1dd7a84a52dffdb89" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ - "bitflags 2.4.2", + "bitflags", "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] name = "rustversion" -version = "1.0.14" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" +checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" [[package]] name = "ryu" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" +checksum 
= "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" [[package]] name = "scopeguard" @@ -683,11 +733,11 @@ checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29" [[package]] name = "strum_macros" -version = "0.26.2" +version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6cf59daf282c0a494ba14fd21610a0325f9f90ec9d1231dea26bcb1d696c946" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ - "heck 0.4.1", + "heck", "proc-macro2", "quote", "rustversion", @@ -725,7 +775,17 @@ dependencies = [ "cfg-if", "fastrand", "rustix", - "windows-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "terminal_size" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" +dependencies = [ + "rustix", + "windows-sys 0.48.0", ] [[package]] @@ -741,9 +801,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.9" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "utf8parse" @@ -779,128 +839,144 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 
0.52.0", + "windows-targets 0.52.5", ] [[package]] name = "windows-targets" -version = "0.48.1" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm 0.48.0", - "windows_aarch64_msvc 0.48.0", - "windows_i686_gnu 0.48.0", - "windows_i686_msvc 0.48.0", - "windows_x86_64_gnu 0.48.0", - "windows_x86_64_gnullvm 0.48.0", - "windows_x86_64_msvc 0.48.0", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", ] [[package]] name = "windows-targets" -version = "0.52.0" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" dependencies = [ - "windows_aarch64_gnullvm 0.52.0", - "windows_aarch64_msvc 0.52.0", - "windows_i686_gnu 0.52.0", - "windows_i686_msvc 0.52.0", - "windows_x86_64_gnu 0.52.0", - "windows_x86_64_gnullvm 0.52.0", - "windows_x86_64_msvc 0.52.0", + "windows_aarch64_gnullvm 0.52.5", + "windows_aarch64_msvc 0.52.5", + "windows_i686_gnu 0.52.5", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.5", + "windows_x86_64_gnu 0.52.5", + "windows_x86_64_gnullvm 0.52.5", + "windows_x86_64_msvc 0.52.5", ] [[package]] name = "windows_aarch64_gnullvm" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.0" +version = "0.52.5" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" [[package]] name = "windows_aarch64_msvc" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.52.0" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" [[package]] name = "windows_i686_gnu" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.52.0" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" [[package]] name = "windows_i686_msvc" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = 
"0.52.0" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" [[package]] name = "windows_x86_64_gnu" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.52.0" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" [[package]] name = "windows_x86_64_gnullvm" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.0" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" [[package]] name = "windows_x86_64_msvc" -version = "0.48.0" +version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.52.0" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" [[package]] name = "zstd-safe" diff --git a/Cargo.toml b/Cargo.toml index ff4b74e..281663e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,8 +9,9 @@ members = [ "utils/slice_search", "utils/sorted_array", "utils/id_allocator", - - "bench/runner", + "utils/learned_segment", + "utils/gapped_array", + "bench/runner", ] exclude = [ diff --git a/bench/instance/Cargo.lock b/bench/instance/Cargo.lock index a871e17..332584a 100644 --- a/bench/instance/Cargo.lock +++ b/bench/instance/Cargo.lock @@ -37,7 +37,7 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" dependencies = [ - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -47,7 +47,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" dependencies = [ "anstyle", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -94,6 +94,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bitflags" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" + [[package]] name = "byteorder" version = "1.5.0" @@ -214,6 +220,22 @@ dependencies = [ "shared-local-state", ] +[[package]] +name = "either" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" + +[[package]] +name = "errno" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "fault-injection" version = "1.0.10" @@ -251,6 +273,16 @@ dependencies = [ "byteorder", ] +[[package]] +name = "gapped_array" +version = "0.1.0" +dependencies = [ + "itertools", + "kdam", + "serde", + "slice_search", +] + [[package]] name = "generational-arena" version = "0.2.9" @@ -294,6 +326,15 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.11" @@ -309,12 +350,30 @@ dependencies = [ "libc", ] +[[package]] +name = "kdam" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "526586ea01a9a132b5f8d3a60f6d6b41b411550236f5ee057795f20b37316957" +dependencies = [ + "terminal_size", + "windows-sys 0.52.0", +] + [[package]] name = "lazy_static" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "learned_index_segmentation" +version = "0.1.0" +dependencies = [ + "num", + "serde", +] + [[package]] name = "libc" version = "0.2.153" @@ -346,9 +405,11 @@ version = "0.3.4" dependencies = [ "anyhow", "bincode", + "gapped_array", "generational-arena", "id_allocator", "lazy_static", + "learned_index_segmentation", "marble", "num", "serde", @@ -388,6 +449,12 @@ dependencies = [ "sled", ] +[[package]] +name = "linux-raw-sys" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" + [[package]] name = "lock_api" version = "0.4.11" @@ -583,7 +650,7 @@ version = "0.2.16" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ - "bitflags", + "bitflags 1.3.2", ] [[package]] @@ -592,7 +659,20 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" dependencies = [ - "bitflags", + "bitflags 1.3.2", +] + +[[package]] +name = "rustix" +version = "0.38.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +dependencies = [ + "bitflags 2.5.0", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", ] [[package]] @@ -740,6 +820,16 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "terminal_size" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" +dependencies = [ + "rustix", + "windows-sys 0.48.0", +] + [[package]] name = "trait-set" version = "0.3.0" @@ -785,6 +875,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + [[package]] name = "windows-sys" version = "0.52.0" diff --git a/core/Cargo.toml b/core/Cargo.toml index 3bfbcaf..d417a28 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -23,7 +23,10 @@ bincode = { version = "1.3.3" } anyhow = "1.0.82" sorted_array = { path = "../utils/sorted_array", version = "0.1.3", features = ["serde"] } +gapped_array = { path = "../utils/gapped_array", version = "0.1.0" } id_allocator = { path = "../utils/id_allocator", 
version = "0.1.0", features = ["serde"] } +learned_index_segmentation = { path = "../utils/learned_segment", version = "0.1.0" } + lazy_static = "1.4.0" [dev-dependencies] diff --git a/core/src/classical/disk/boundary_layer.rs b/core/src/classical/btree_disk/boundary_layer.rs similarity index 100% rename from core/src/classical/disk/boundary_layer.rs rename to core/src/classical/btree_disk/boundary_layer.rs diff --git a/core/src/classical/disk/deep_layer.rs b/core/src/classical/btree_disk/deep_layer.rs similarity index 100% rename from core/src/classical/disk/deep_layer.rs rename to core/src/classical/btree_disk/deep_layer.rs diff --git a/core/src/classical/disk/mod.rs b/core/src/classical/btree_disk/mod.rs similarity index 100% rename from core/src/classical/disk/mod.rs rename to core/src/classical/btree_disk/mod.rs diff --git a/core/src/classical/memory/layer.rs b/core/src/classical/btree_memory/layer.rs similarity index 100% rename from core/src/classical/memory/layer.rs rename to core/src/classical/btree_memory/layer.rs diff --git a/core/src/classical/memory/mod.rs b/core/src/classical/btree_memory/mod.rs similarity index 95% rename from core/src/classical/memory/mod.rs rename to core/src/classical/btree_memory/mod.rs index bc7da89..55f149e 100644 --- a/core/src/classical/memory/mod.rs +++ b/core/src/classical/btree_memory/mod.rs @@ -3,7 +3,7 @@ mod layer; use crate::common::list::memory::ArenaID; use crate::node_layer::{impl_node_layer, NodeLayer}; use crate::traits::Address; -use crate::{component::*, Key, StaticBounded, Value}; +use crate::{component::*, Key, Value}; use layer::*; // ------------------------------------------------------- @@ -12,7 +12,7 @@ use layer::*; pub type BTreeInternalAddress = ArenaID; -pub struct BTreeInternalComponent { +pub struct BTreeInternalComponent { inner: MemoryBTreeLayer, _ph: std::marker::PhantomData, } @@ -20,7 +20,7 @@ pub struct BTreeInternalComponent NodeLayer for BTreeInternalComponent where - K: Clone + Ord + 
StaticBounded, + K: Key, BA: Address, PA: Address, { diff --git a/core/src/classical/mod.rs b/core/src/classical/mod.rs index a008fbd..eca6275 100644 --- a/core/src/classical/mod.rs +++ b/core/src/classical/mod.rs @@ -1,9 +1,9 @@ +pub mod btree_disk; +pub mod btree_memory; pub mod btree_top; -pub mod disk; -pub mod memory; mod node; +pub use btree_disk::*; +pub use btree_memory::*; pub use btree_top::*; -pub use disk::*; -pub use memory::*; diff --git a/core/src/common/list/memory.rs b/core/src/common/list/memory.rs index 9d9fb6f..85739c1 100644 --- a/core/src/common/list/memory.rs +++ b/core/src/common/list/memory.rs @@ -45,10 +45,11 @@ where } } - pub fn insert_after(&mut self, inner: N, ptr: ArenaID) -> ArenaID { + #[must_use] + pub fn insert_after(&mut self, node: N, ptr: ArenaID) -> ArenaID { let next_ptr = self.arena[ptr].0.next; - let mut new_node = MemoryNode::new(inner); + let mut new_node = MemoryNode::new(node); new_node.previous = Some(ptr); new_node.next = next_ptr; @@ -65,10 +66,11 @@ where } #[allow(unused)] - pub fn insert_before(&mut self, inner: N, ptr: ArenaID) -> ArenaID { + #[must_use] + pub fn insert_before(&mut self, node: N, ptr: ArenaID) -> ArenaID { let previous_ptr = self.arena[ptr].0.previous; - let mut new_node = MemoryNode::new(inner); + let mut new_node = MemoryNode::new(node); new_node.previous = previous_ptr; new_node.next = Some(ptr); @@ -84,6 +86,7 @@ where new_node_ptr } + #[must_use] pub fn clear(&mut self) -> ArenaID { self.arena.clear(); let ptr = self diff --git a/core/src/learned/mod.rs b/core/src/learned/mod.rs new file mode 100644 index 0000000..2ebb2c5 --- /dev/null +++ b/core/src/learned/mod.rs @@ -0,0 +1,5 @@ +pub mod pgm_memory; + +mod node; + +pub use pgm_memory::*; diff --git a/core/src/learned/node.rs b/core/src/learned/node.rs new file mode 100644 index 0000000..c829a1b --- /dev/null +++ b/core/src/learned/node.rs @@ -0,0 +1,68 @@ +use learned_index_segmentation::LinearModel; + +use crate::{Key, KeyBounded, 
StaticBounded}; +use gapped_array::GappedKVArray; + +impl KeyBounded for LinearModel { + fn lower_bound(&self) -> &K { + self.min_key() + } +} + +#[derive(Debug)] +pub struct PGMNode { + gapped: GappedKVArray, + model: LinearModel, +} + +impl KeyBounded for PGMNode { + fn lower_bound(&self) -> &K { + self.gapped.min().unwrap_or(&K::max_ref()) + } +} + +impl Default for PGMNode { + fn default() -> Self { + Self { + gapped: GappedKVArray::new(0), + model: LinearModel::sentinel(), + } + } +} + +impl PGMNode { + pub fn from_trained(model: LinearModel, entries: Vec<(K, V)>) -> Self { + // NOTE: Filling at 0.5 utilization is just a heuristic, eventually this should be a param + let mut gapped = GappedKVArray::new(entries.len() * 2); + for (key, value) in entries { + let hint = model.hint(&key).min(gapped.len() - 1); + gapped + .initial_model_based_insert((key, value), hint) + .unwrap(); + } + Self { gapped, model } + } + + pub fn search_exact(&self, key: &K) -> Option<&V> { + let hint = self.model.hint(key); + self.gapped.search_exact(key, Some(hint)) + } + + pub fn search_pir(&self, key: &K) -> &V { + let hint = self.model.hint(key); + match self.gapped.search_pir(key, Some(hint)) { + Some(val) => val, + None => self.gapped.min_val().unwrap(), + } + } + + pub fn grow_insert(&mut self, entry: (K, V)) { + if self.gapped.density() >= 0.8 { + let scale_factor = 2.0; + self.gapped.rescale(scale_factor).unwrap(); + self.model.rescale(scale_factor as f64); + } + let hint = self.model.hint(&entry.0); + self.gapped.upsert_with_hint(entry, hint).unwrap(); + } +} diff --git a/core/src/learned/pgm_memory/layer.rs b/core/src/learned/pgm_memory/layer.rs new file mode 100644 index 0000000..64bc909 --- /dev/null +++ b/core/src/learned/pgm_memory/layer.rs @@ -0,0 +1,124 @@ +// ---------------------------------------- +// Layer Type +// ---------------------------------------- + +use std::ops::Bound; + +use learned_index_segmentation::linear_simple_segmentation; + +use 
crate::common::list::memory::*; +use crate::iter::Iter; +use crate::learned::node::PGMNode; +use crate::{impl_node_layer, Address, Key, NodeLayer}; + +pub struct MemoryPGMLayer { + inner: MemoryList, PA>, +} + +struct FillerIter<'a, K, B, SA, PA> +where + SA: Address, + PA: Address, +{ + iter: Iter<'a, K, B, SA, PA>, +} +impl<'a, K, B, SA, PA> Iterator for FillerIter<'a, K, B, SA, PA> +where + K: Clone, + B: NodeLayer, + SA: Address, + PA: Address, +{ + type Item = (K, SA); + + fn next(&mut self) -> Option { + match self.iter.next() { + Some((key, address)) => Some((key, address)), + None => None, + } + } +} + +impl MemoryPGMLayer +where + K: Key, +{ + pub fn empty() -> Self { + Self { + inner: MemoryList::empty(), + } + } + + pub fn fill(&mut self, iter: impl Iterator) { + let trained = linear_simple_segmentation::<_, _, EPSILON>(iter); + + let mut ptr = self.inner.clear(); + + for (model, entries) in trained.into_iter().rev() { + let node = PGMNode::from_trained(model, entries); + ptr = self.inner.insert_before(node, ptr); + } + } + + pub fn fill_will_parent>(&mut self, base: &mut B) + where + V: Address, + { + let iter = base.range(Bound::Unbounded, Bound::Unbounded); + let iter = FillerIter { iter }; + + let trained = linear_simple_segmentation::<_, _, EPSILON>(iter); + + let mut ptr = self.inner.clear(); + + for (model, entries) in trained.into_iter().rev() { + let node = PGMNode::from_trained(model, entries.clone()); + ptr = self.inner.insert_before(node, ptr); + for (_, value) in entries.iter() { + base.set_parent(value.clone(), ptr); + } + } + } + + pub fn insert(&mut self, key: K, value: V, ptr: ArenaID) -> Option<(K, ArenaID, PA)> + where + PA: Address, + { + self.inner[ptr].grow_insert((key, value)); + None + } + + pub fn insert_with_parent>( + &mut self, + key: K, + value: V, + base: &mut B, + ptr: ArenaID, + ) -> Option<(K, ArenaID, PA)> + where + V: Address, + PA: Address, + { + self.inner[ptr].grow_insert((key, value.clone())); + 
base.set_parent(value, ptr); + None + } +} + +impl core::ops::Index + for MemoryPGMLayer +{ + type Output = PGMNode; + + fn index(&self, index: ArenaID) -> &Self::Output { + &self.inner[index] + } +} + +impl NodeLayer for MemoryPGMLayer +where + K: Key, + PA: Address, +{ + impl_node_layer!(ArenaID, PA); +} diff --git a/core/src/learned/pgm_memory/mod.rs b/core/src/learned/pgm_memory/mod.rs new file mode 100644 index 0000000..7f1db09 --- /dev/null +++ b/core/src/learned/pgm_memory/mod.rs @@ -0,0 +1,129 @@ +use num::PrimInt; + +use crate::{ + common::list::memory::ArenaID, impl_node_layer, Address, BaseComponent, InternalComponent, Key, + NodeLayer, PropagateInsert, StaticBounded, Value, +}; + +use self::layer::MemoryPGMLayer; + +mod layer; + +// ------------------------------------------------------- +// Internal Component +// ------------------------------------------------------- + +pub type PGMInternalAddress = ArenaID; + +pub struct PGMInternalComponent { + inner: MemoryPGMLayer, + _ph: std::marker::PhantomData, +} + +impl NodeLayer + for PGMInternalComponent +where + K: Clone + Ord + StaticBounded + PrimInt, + BA: Address, + PA: Address, +{ + impl_node_layer!(ArenaID, PA); +} + +impl, const EPSILON: usize> + InternalComponent + for PGMInternalComponent +where + K: Key + PrimInt, + BA: Address, + PA: Address, +{ + fn search(&self, _: &B, ptr: PGMInternalAddress, key: &K) -> BA { + let node = &self.inner[ptr]; + node.search_pir(key).clone() + } + + fn insert( + &mut self, + base: &mut B, + prop: crate::PropagateInsert, + ) -> Option> { + match prop { + PropagateInsert::Single(key, address, ptr) => { + let result = self.inner.insert_with_parent(key, address, base, ptr); + result.map(|(key, address, parent)| PropagateInsert::Single(key, address, parent)) + } + PropagateInsert::Replace(_, _) => { + unimplemented!() + } + } + } + + fn build(base: &mut B) -> Self { + let mut result = MemoryPGMLayer::empty(); + result.fill_will_parent(base); + + Self { + inner: result, 
+ _ph: std::marker::PhantomData, + } + } +} + +// ------------------------------------------------------- +// Base Component +// ------------------------------------------------------- + +pub type PGMBaseAddress = PGMInternalAddress; + +pub struct PGMBaseComponent { + inner: MemoryPGMLayer, +} + +impl NodeLayer + for PGMBaseComponent +where + K: Key + PrimInt, + V: Value, + PA: Address, +{ + impl_node_layer!(ArenaID, PA); +} + +impl BaseComponent + for PGMBaseComponent +where + K: Key + PrimInt, + V: Value, + PA: Address, +{ + fn insert( + &mut self, + ptr: PGMBaseAddress, + key: K, + value: V, + ) -> Option> { + if let Some((key, address, parent)) = self.inner.insert(key, value, ptr) { + Some(PropagateInsert::Single(key, address, parent)) + } else { + None + } + } + + fn search(&self, ptr: PGMBaseAddress, key: &K) -> Option { + self.inner[ptr].search_exact(key).cloned() + } + + fn empty() -> Self { + let result = MemoryPGMLayer::empty(); + + Self { inner: result } + } + + fn build(iter: impl Iterator) -> Self { + let mut result = MemoryPGMLayer::empty(); + result.fill(iter); + + Self { inner: result } + } +} diff --git a/core/src/lib.rs b/core/src/lib.rs index b1dcb85..f07155b 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -2,6 +2,7 @@ pub mod classical; pub mod component; pub mod iter; pub mod kv_store; +pub mod learned; mod common; mod node_layer; @@ -12,6 +13,7 @@ pub use anyhow::Result; pub use classical::*; pub use common::storage::GlobalStore; +pub use learned::*; pub use component::*; pub use kv_store::*; diff --git a/core/src/traits.rs b/core/src/traits.rs index bf1beb2..cb9e367 100644 --- a/core/src/traits.rs +++ b/core/src/traits.rs @@ -1,4 +1,5 @@ use lazy_static::lazy_static; +use num::PrimInt; use serde::{Deserialize, Serialize}; use trait_set::trait_set; @@ -11,7 +12,7 @@ trait_set! 
{ pub trait Persisted = Serialize + for<'de> Deserialize<'de> + Clone + Default + Eq + 'static; /// General key type - pub trait Key = Clone + StaticBounded + 'static; + pub trait Key = PrimInt + Clone + StaticBounded + 'static ; /// General value type pub trait Value = Clone + 'static; @@ -23,6 +24,8 @@ pub trait KeyBounded { pub trait StaticBounded: Ord + 'static { fn min_ref() -> &'static Self; + + fn max_ref() -> &'static Self; } macro_rules! impl_integer { @@ -33,6 +36,11 @@ macro_rules! impl_integer { static MIN: $t = <$t>::min_value(); &MIN } + + fn max_ref() -> &'static Self { + static MAX: $t = <$t>::max_value(); + &MAX + } } impl KeyBounded<$t> for $t { @@ -48,10 +56,15 @@ impl_integer!(usize, u8, u16, u32, u64, u128, isize, i8, i16, i32, i64, i128); lazy_static! { static ref MIN_STRING: String = "".to_string(); + static ref MAX_STRING: String = "".to_string(); } impl StaticBounded for String { fn min_ref() -> &'static Self { &MIN_STRING } + + fn max_ref() -> &'static Self { + &MAX_STRING + } } diff --git a/derive/src/component.rs b/derive/src/component.rs index dd4b7da..b84fca9 100644 --- a/derive/src/component.rs +++ b/derive/src/component.rs @@ -12,6 +12,7 @@ use syn::{ pub enum Component { BTreeTop, BTree { fanout: usize, persist: bool }, + PGM { epsilon: usize, }, } pub struct ParsedComponent { @@ -61,6 +62,17 @@ impl Parse for ParsedComponent { Component::BTree { fanout, persist } } + "pgm" => { + let epsilon = attributes.try_get_integer(&ident, "epsilon")?; + + let epsilon = if epsilon > 0 { + epsilon as usize + } else { + bail!(ident, "Specified epsilon is not positive"); + }; + + Component::PGM { epsilon } + } _ => { bail!(ident, "Unknown component `{}`!", ident.to_string()); } @@ -102,12 +114,14 @@ pub enum PersistType { #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub enum InternalComponent { BTree { fanout: usize, persist: PersistType }, + PGM {epsilon: usize}, } impl ToString for InternalComponent { fn to_string(&self) -> String { match self 
{ - Self::BTree { fanout, persist } => format!("{persist:?}BTreeInternal{fanout:?}").to_string() + Self::BTree { fanout, persist } => format!("{persist:?}BTreeInternal{fanout:?}").to_string(), + Self::PGM {epsilon} => format!("PGMInternal{epsilon:?}").to_string(), } } } @@ -145,6 +159,10 @@ impl InternalComponent { fanout, persist: PersistType::DeepDisk, }), + ( + Component::PGM { epsilon }, + _ + ) => Some(Self::PGM { epsilon }), _ => None, } } @@ -172,6 +190,9 @@ impl InternalComponent { persist: PersistType::DeepDisk, } => quote!(DeepDiskBTreeInternalComponent) .to_token_stream(), + + InternalComponent::PGM { epsilon } => + quote!(PGMInternalComponent).to_token_stream(), } } @@ -188,12 +209,17 @@ impl InternalComponent { InternalComponent::BTree { persist: PersistType::DeepDisk, .. } => { quote!(DeepDiskBTreeInternalAddress).to_token_stream() } + + InternalComponent::PGM {..} => { + quote!(PGMInternalAddress).to_token_stream() + } } } pub fn is_persisted(&self) -> bool { match *self { InternalComponent::BTree { persist, .. 
} => persist != PersistType::InMemory, + InternalComponent::PGM {..} => false, } } } @@ -201,12 +227,14 @@ impl InternalComponent { #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub enum BaseComponent { BTree { fanout: usize, persist: PersistType }, + PGM {epsilon: usize}, } impl ToString for BaseComponent { fn to_string(&self) -> String { match self { - Self::BTree { fanout, persist } => format!("{persist:?}BTreeBase{fanout:?}").to_string() + Self::BTree { fanout, persist } => format!("{persist:?}BTreeBase{fanout:?}").to_string(), + Self::PGM {epsilon} => format!("PGMBase{epsilon:?}").to_string(), } } } @@ -244,6 +272,7 @@ impl BaseComponent { fanout, persist: PersistType::DeepDisk, }), + (Component::PGM {epsilon}, _) => Some(Self::PGM {epsilon}), _ => None, } } @@ -266,6 +295,10 @@ impl BaseComponent { persist: PersistType::DeepDisk, } => quote!(DeepDiskBTreeBaseComponent) .to_token_stream(), + + BaseComponent::PGM { + epsilon + } => quote!(PGMBaseComponent).to_token_stream(), } } @@ -285,12 +318,17 @@ impl BaseComponent { persist: PersistType::DeepDisk, .. } => quote!(DeepDiskBTreeBaseAddress).to_token_stream(), + + BaseComponent::PGM { + .. + } => quote!(PGMBaseAddress).to_token_stream(), } } pub fn is_persisted(&self) -> bool { match *self { BaseComponent::BTree { persist, .. } => persist != PersistType::InMemory, + BaseComponent::PGM { .. 
} => false, } } } diff --git a/tests/Cargo.toml b/tests/Cargo.toml index 2dba66b..4035a33 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -10,3 +10,4 @@ limousine_engine = { path = "../engine" } rand = "0.8.5" rand_distr = "0.4.3" tempfile = "3.0" +num = "0.4.0" diff --git a/tests/src/lib.rs b/tests/src/lib.rs index f6f8c08..570bf7e 100644 --- a/tests/src/lib.rs +++ b/tests/src/lib.rs @@ -57,7 +57,7 @@ mod tests { // Test for insert now for key in 0..10_000 { - index.insert(key, key * key as V)?; + index.insert(key, key as V * key as V)?; } // Search again @@ -106,6 +106,39 @@ mod tests { } } + /// Same as test_kv_store, but instead of inserting elements one at a time, + /// the index is built over the numbers + fn test_kv_store_build>() { + let mut rng = thread_rng(); + let key_dist = Uniform::new(K::MIN, K::MAX); + let value_dist = Uniform::new(V::MIN, V::MAX); + + let num = 20_000; + let mut keys: Vec = (&mut rng) + .sample_iter(key_dist) + .filter(|&x| x < 0 as K || x > 10_000 as K) // we want to test for false positives as + // well + .take(num) + .collect(); + keys.sort(); + + let values: Vec = (&mut rng).sample_iter(value_dist).take(num).collect(); + + { + // Test build + let kv_store = KV::build(keys.clone().into_iter().zip(values.clone().into_iter())); + + // Test searches + for i in 0..num { + assert_eq!(kv_store.search(keys[i]), Some(values[i])); + } + + for key in 0..10_000 { + assert_eq!(kv_store.search(key as K), None); + } + } + } + #[test] fn test_persisted_kv_store_1() -> limousine_engine::Result<()> { create_kv_store! { @@ -319,4 +352,39 @@ mod tests { test_kv_store::>(); } + + #[test] + + fn test_pgm_store_3() { + create_kv_store! { + name: PGMStore1, + layout: [ + btree_top(), + pgm(epsilon = 8), + pgm(epsilon = 8), + ] + } + + test_kv_store_build::>(); + } + + #[test] + fn test_pgm_store_9() { + create_kv_store! 
{ + name: PGMStore1, + layout: [ + btree_top(), + pgm(epsilon = 8), + pgm(epsilon = 8), + pgm(epsilon = 8), + pgm(epsilon = 8), + pgm(epsilon = 8), + pgm(epsilon = 8), + pgm(epsilon = 8), + pgm(epsilon = 8), + ] + } + + test_kv_store_build::>(); + } } diff --git a/utils/gapped_array/Cargo.toml b/utils/gapped_array/Cargo.toml new file mode 100644 index 0000000..ba0c50c --- /dev/null +++ b/utils/gapped_array/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "gapped_array" +version = "0.1.0" +edition = "2021" +description = "A sorted array with gapped-based inserts" +authors = ["Mark Pekala "] +keywords = ["contained", "gapped-array", "no_std"] +license = "Apache-2.0" +repository = "https://github.com/LevKruglyak/limousine" + + +[dependencies] +slice_search = { path = "../slice_search", version = "0.1.2" } +serde = { version = "1.0.197", features = ["derive"] } +itertools = "0.12.1" +kdam = "0.5.1" + diff --git a/utils/gapped_array/src/lib.rs b/utils/gapped_array/src/lib.rs new file mode 100644 index 0000000..244e1e2 --- /dev/null +++ b/utils/gapped_array/src/lib.rs @@ -0,0 +1,774 @@ +use core::fmt; +use core::mem::MaybeUninit; +use std::mem::size_of; + +/// A sorted array which is constructed with intentional gaps to allow for practical in-place inserts +/// NOTE: The current implementation assumes keys are unique. It may break if this is not true. +/// NOTE: The current implementation is not heavily optimized. 
+#[derive(Debug)] +pub struct GappedKVArray +where + K: Ord, +{ + bitmap: Box<[bool]>, + keys: Box<[MaybeUninit]>, + vals: Box<[MaybeUninit]>, + size: usize, +} + +impl GappedKVArray +where + K: Ord, +{ + /// Creates an empty gapped array with the given size + pub fn new(size: usize) -> Self { + let bitmap_vec = vec![false; size]; + let mut keys_vec = Vec::>::with_capacity(size); + let mut vals_vec = Vec::>::with_capacity(size); + for _ in 0..size { + keys_vec.push(MaybeUninit::uninit()); + vals_vec.push(MaybeUninit::uninit()); + } + Self { + bitmap: bitmap_vec.into_boxed_slice(), + keys: keys_vec.into_boxed_slice(), + vals: vals_vec.into_boxed_slice(), + size: 0, + } + } + + /// The length of the gapped array (including gaps) + pub const fn len(&self) -> usize { + self.bitmap.len() + } + + /// The length of the gapped array (excluding gaps) + pub fn size(&self) -> usize { + self.size + } + + /// Is the gapped array full? 
+ pub fn is_full(&self) -> bool { + self.len() <= self.size() + } + + /// The density of the gapped array + pub fn density(&self) -> f32 { + self.size as f32 / self.len() as f32 + } + + /// Helper function to implement next occupied and next free + /// TODO: Bithacks to make faster + fn next_ix_helper(&self, mut ix: usize, val: bool) -> Option { + while ix < self.len() && self.bitmap[ix] != val { + ix += 1 + } + if ix < self.len() { + Some(ix) + } else { + None + } + } + + /// Returns the next occupied slot in the range [ix, end] + fn next_occupied_ix(&self, ix: usize) -> Option { + self.next_ix_helper(ix, true) + } + + /// Returns the next free slot in the range [ix, end] + fn next_free_ix(&self, ix: usize) -> Option { + self.next_ix_helper(ix, false) + } + + /// Helper function to implement prev occupied and prev free + /// TODO: Bithacks to make faster + fn prev_ix_helper(&self, mut ix: usize, val: bool) -> Option { + loop { + if self.bitmap[ix] == val { + return Some(ix); + } + if ix == 0 { + return None; + } + ix -= 1; + } + } + + /// Returns the previous occupied slot in the range [start, ix] + fn prev_occupied_ix(&self, ix: usize) -> Option { + self.prev_ix_helper(ix, true) + } + + /// Returns the previous free slot in the range [start, ix] + fn prev_free_ix(&self, ix: usize) -> Option { + self.prev_ix_helper(ix, false) + } + + /// Returns the Some(ix) s.t. keys[ix] <= needle, and for all jx > ix, needle < keys[jx] + /// Returns None if needle is smaller than everything in the array + /// NOTE: Hint is just to help search speed. This ALWAYS returns a correct result. 
+ fn price_is_right(&self, needle: &K, hint: Option) -> Option { + // First, move as far to the right as we can from the hint + let mut check = self.next_occupied_ix(hint.unwrap_or(self.len() / 2)); + while check.is_some() { + let next = self.next_occupied_ix(check.unwrap() + 1); + match next { + Some(next_ix) => { + unsafe { + if needle < self.keys[next_ix].assume_init_ref() { + break; + } + } + check = Some(next_ix); + } + None => break, + } + } + // Handle edge case where hint put us past the end of the array, snap back to left if there is an element + check = match check { + Some(ix) => Some(ix), + None => { + if self.len() > 0 { + self.prev_occupied_ix(self.len() - 1) + } else { + None + } + } + }; + // Then ensure correctness by moving left as far as we need to + while check.is_some() { + unsafe { + if self.keys[check.unwrap()].assume_init_ref() <= needle { + break; + } + } + if check.unwrap() == 0 { + check = None; + break; + } + check = self.prev_occupied_ix(check.unwrap() - 1); + } + check + } + + /// Search the gapped array for a specific value, returning the "price is right" value + /// (I.e. 
the biggest value without going over key) + /// TODO: Make exponential search + /// TODO: Update slice_search so it can work on gapped arrays + pub fn search_pir(&self, needle: &K, hint: Option) -> Option<&V> { + match self.price_is_right(needle, hint) { + Some(ix) => match self.vals.get(ix) { + Some(val) => unsafe { + return Some(val.assume_init_ref()); + }, + None => None, + }, + None => None, + } + } + + /// Search the gapped array for a specific value, using a starting hint + /// TODO: Make exponential search + pub fn search_exact(&self, needle: &K, hint: Option) -> Option<&V> { + match self.price_is_right(needle, hint) { + Some(ix) => unsafe { + if self.keys[ix].assume_init_ref() == needle { + match self.vals.get(ix) { + Some(val) => Some(val.assume_init_ref()), + None => None, + } + } else { + None + } + }, + None => None, + } + } + + /// Helper function to copy within for all the needed arrays + fn copy_within(&mut self, src: std::ops::Range, dest: usize) { + self.bitmap.copy_within(src.clone(), dest); + unsafe { + let key_src = self.keys.get_unchecked(src.start).as_ptr(); + let key_dest = self.keys.get_unchecked_mut(dest).as_mut_ptr(); + core::ptr::copy(key_src, key_dest, src.clone().count()); + + let val_src = self.vals.get_unchecked(src.start).as_ptr(); + let val_dest = self.vals.get_unchecked_mut(dest).as_mut_ptr(); + core::ptr::copy(val_src, val_dest, src.count()); + } + } + + /// Helper function to upsert an entry into a given location + fn upsert_at(&mut self, pair: (K, V), ix: usize) { + if !self.bitmap[ix] { + // Inserting a new element + self.size += 1; + } + self.bitmap[ix] = true; + self.keys[ix] = MaybeUninit::::new(pair.0); + self.vals[ix] = MaybeUninit::::new(pair.1); + } + + /// Helper function to remove an entry in a given location + fn remove_at(&mut self, ix: usize) -> Result<(K, V), String> { + if !self.bitmap[ix] { + Err("No such element exists for remove_at".to_string()) + } else { + self.bitmap[ix] = false; + let key = 
std::mem::replace(&mut self.keys[ix], MaybeUninit::uninit()); + let val = std::mem::replace(&mut self.vals[ix], MaybeUninit::uninit()); + self.size -= 1; + unsafe { Ok((key.assume_init(), val.assume_init())) } + } + } + + /// Upsert a specific value into the array with the given hint + pub fn upsert_with_hint(&mut self, pair: (K, V), hint: usize) -> Result<(), String> { + let maybe_ix = self.price_is_right(&pair.0, Some(hint)); + match maybe_ix { + None => { + // Edge case where upserting at the beginning + let Some(closest_ix) = self.next_free_ix(0) else { + return Err("Gapped array is full (beginning)".to_string()); + }; + self.copy_within(0..closest_ix, 1); + self.upsert_at(pair, 0); + Ok(()) + } + Some(mut ix) => { + unsafe { + if self.keys[ix].assume_init_ref() == &pair.0 { + // If this is an update handle it quickly and return + self.upsert_at(pair, ix); + return Ok(()); + } + } + if ix + 1 == self.len() { + // Edge case where upserting at the end + let Some(closest_ix) = self.prev_free_ix(self.len() - 1) else { + return Err("Gapped array is full (end)".to_string()); + }; + self.copy_within(closest_ix + 1..self.len(), closest_ix); + self.bitmap[self.len() - 1] = false; // So size is updated correctly + self.upsert_at(pair, self.len() - 1); + Ok(()) + } else { + // We're doing a "normal" upsert into the middle of the array + ix += 1; // Price-is-right quirk + if !self.bitmap[ix] { + // Easy win + self.upsert_at(pair, ix); + return Ok(()); + } + let shift_left_ix = self.prev_free_ix(ix - 1); + let shift_right_ix = self.next_free_ix(ix + 1); + match (shift_left_ix, shift_right_ix) { + (Some(lix), Some(rix)) => { + if lix.abs_diff(ix) < rix.abs_diff(ix) { + self.copy_within(lix + 1..ix + 1, lix); + self.bitmap[ix - 1] = false; // So size is updated correctly + self.upsert_at(pair, ix - 1); + Ok(()) + } else { + self.copy_within(ix..rix, ix + 1); + self.bitmap[ix] = false; // So size is updated correctly + self.upsert_at(pair, ix); + Ok(()) + } + } + (Some(lix), 
None) => { + self.copy_within(lix + 1..ix + 1, lix); + self.bitmap[ix - 1] = false; // So size is updated correctly + self.upsert_at(pair, ix - 1); + Ok(()) + } + (None, Some(rix)) => { + self.copy_within(ix..rix, ix + 1); + self.bitmap[ix] = false; // So size is updated correctly + self.upsert_at(pair, ix); + Ok(()) + } + _ => Err("Gapped array is full (_)".to_string()), + } + } + } + } + } + + /// Called to efficiently handle the initial upserts. NOTE: This makes two assumptions: + /// - The values themselves are monotonically increasing + /// - The hints are monotonically non-decreasing + /// If either of these assumptions break, bad stuff may happen (use regular upsert) + pub fn initial_model_based_insert(&mut self, pair: (K, V), hint: usize) -> Result<(), String> { + if !self.bitmap[hint] { + self.upsert_at(pair, hint); + return Ok(()); + } + match self.next_free_ix(hint + 1) { + Some(free_ix) => { + self.upsert_at(pair, free_ix); + Ok(()) + } + None => match self.prev_free_ix(self.len().saturating_sub(1)) { + Some(free_ix) => { + self.copy_within(free_ix + 1..self.len(), free_ix); + self.upsert_at(pair, self.len().saturating_sub(1)); + Ok(()) + } + None => Err("Gapped array is full".to_string()), + }, + } + } + + /// Finds an element with key `needle` and removes that element and up to `window_radius` + /// elements on each side. 
+ pub fn trim_window( + &mut self, + needle: K, + window_radius: u32, + hint: usize, + ) -> Result, String> { + match self.price_is_right(&needle, Some(hint)) { + Some(ix) => { + unsafe { + if self.keys[ix].assume_init_ref() != &needle { + return Err("Can't trim window: supposed key doesn't exist".to_string()); + } + } + let mut in_order: Vec = vec![]; + // First add the actual element + let (_, v) = self.remove_at(ix).unwrap(); + in_order.push(v); + // Then get the elements to the left + if ix > 0 { + let mut num_left = 0; + let mut kx = self.prev_occupied_ix(ix - 1); + while let Some(jx) = kx { + let (_, v) = self.remove_at(jx).unwrap(); + in_order.insert(0, v); + if jx == 0 { + break; + } + kx = self.prev_occupied_ix(jx - 1); + num_left += 1; + if window_radius <= num_left { + break; + } + } + } + // Then get the elements to the right + let mut num_right = 0; + let mut kx = self.next_occupied_ix(ix + 1); + while let Some(jx) = kx { + let (_, v) = self.remove_at(jx).unwrap(); + in_order.push(v); + num_right += 1; + if window_radius <= num_right { + break; + } + kx = self.next_occupied_ix(jx + 1); + } + Ok(in_order) + } + None => Err("Can't trim window: supposed key doesn't exist".to_string()), + } + } + + /// Keep the same elements and relative spacing but create more array space and replace as needed + pub fn rescale(&mut self, c: f32) -> Result<(), String> { + if c <= 1.0 { + return Err("Must scale by a constant c > 1.0".to_string()); + } + let new_size = (self.len() as f32 * c) as usize; + let mut temp = Self::new(new_size); + for ix in 0..self.len() { + if !self.bitmap[ix] { + continue; + } + unsafe { + let key = std::mem::replace(&mut self.keys[ix], MaybeUninit::uninit()); + let val = std::mem::replace(&mut self.vals[ix], MaybeUninit::uninit()); + let Ok(_) = temp.initial_model_based_insert( + (key.assume_init(), val.assume_init()), + (ix as f32 * c) as usize, + ) else { + return Err("Failed to re-insert data after scaling up".to_string()); + }; + } + } + 
self.bitmap = temp.bitmap; + self.vals = temp.vals; + self.keys = temp.keys; + Ok(()) + } + + /// The total size of this gapped array + pub fn size_in_bytes(&self) -> u128 { + (size_of::() + (size_of::() + size_of::() + 1) * self.len()) as u128 + } + + /// The total _excess_ size of this gapped array. I.e. how many bytes are needed + /// that are _NOT_ storing the actual data in the index + pub fn excess_size_in_bytes(&self) -> u128 { + let mut num_unoccupied = 0; + for val in self.bitmap.iter() { + if !*val { + num_unoccupied += 1; + } + } + // Bitmap + k,v-size * num not occupied + (self.len() + (size_of::() + size_of::()) * num_unoccupied) as u128 + } + + /// The minimum key in this array, or None if it's empty + pub fn min(&self) -> Option<&K> { + match self.next_occupied_ix(0) { + Some(ix) => match self.keys.get(ix) { + Some(key) => unsafe { Some(key.assume_init_ref()) }, + None => None, + }, + None => None, + } + } + + /// The value stored with the minimum key in this array, or None if it's empty + pub fn min_val(&self) -> Option<&V> { + match self.next_occupied_ix(0) { + Some(ix) => match self.vals.get(ix) { + Some(val) => unsafe { Some(val.assume_init_ref()) }, + None => None, + }, + None => None, + } + } +} + +impl fmt::Display for GappedKVArray +where + K: Default + Clone + Ord + std::fmt::Debug, + V: Default + Clone + Ord + std::fmt::Debug, +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut res = String::new(); + res += &format!( + "[len: {}, size: {}, density: {}\n", + self.len(), + self.size(), + self.density() + ); + for ix in 0..self.len() { + if !self.bitmap[ix] { + res += " None,\n"; + } else { + res += &format!(" ({:?}, {:?}),\n", self.keys[ix], self.vals[ix]); + } + } + res += " ]"; + write!(f, "{}", res) + } +} + +#[cfg(test)] +mod gapped_array_tests { + use super::*; + use itertools::Itertools; + use kdam::{tqdm, BarExt}; + + #[allow(unused)] + fn print_gapped_array(ga: &GappedKVArray) { + let mut line1 = String::new(); + let mut line2 = 
String::new(); + let mut line3 = String::new(); + for ix in 0..ga.len() { + line1 += &format!("{}", if ga.bitmap[ix] { 1 } else { 0 }); + line2 += &format!("{:?}", ga.keys[ix]); + line3 += &format!("{:?}", ga.vals[ix]); + } + println!("bitmap: {}", &line1); + println!("keys: {}", &line2); + println!("vals: {}", &line3); + } + + fn fill_forward_with_hint(size: usize, hint: usize) { + let mut ga = GappedKVArray::::new(size); + for num in 0..size { + let result = ga.upsert_with_hint((num as i32, num as i32), hint); + assert!(result.is_ok()); + // print_gapped_array(&ga); + } + for ix in 0..size { + assert!(ga.bitmap[ix]); + unsafe { + assert!(ga.keys[ix].assume_init() == ix as i32); + } + } + } + + #[test] + fn fill_forward() { + const SIZE: usize = 10; + for hint in 0..SIZE { + fill_forward_with_hint(SIZE, hint); + } + } + + fn fill_backward_with_hint(size: usize, hint: usize) { + let mut ga = GappedKVArray::::new(size); + for num in 0..size { + let result = + ga.upsert_with_hint(((size - num - 1) as i32, (size - num - 1) as i32), hint); + assert!(result.is_ok()); + } + for ix in 0..size { + assert!(ga.bitmap[ix]); + unsafe { + assert!(ga.keys[ix].assume_init() == ix as i32); + } + } + } + + #[test] + fn fill_backward() { + const SIZE: usize = 100; + for hint in 0..SIZE { + fill_backward_with_hint(SIZE, hint); + } + } + + fn get_all_possible_hints(size: usize, num_hints: usize) -> Vec> { + if num_hints == 0 { + return vec![]; + } + if num_hints == 1 { + return (0..size).into_iter().map(|val| vec![val]).collect(); + } + let mut result: Vec> = vec![]; + for first_val in 0..size { + let tails = get_all_possible_hints(size, num_hints - 1); + for tail in tails { + let mut new_thing = vec![first_val]; + new_thing.extend(tail.into_iter()); + result.push(new_thing); + } + } + result + } + + fn test_perm_with_hints(perm: &Vec, hints: &Vec) { + let mut ga = GappedKVArray::::new(perm.len()); + for (value, hint) in perm.iter().zip(hints.iter()) { + assert!(ga + 
.upsert_with_hint((value.clone(), value.clone()), hint.clone()) + .is_ok()); + } + for ix in 0..ga.len() { + let good = unsafe { ga.bitmap[ix] && ga.keys[ix].assume_init() == ix as i32 }; + if !good { + // println!("Perm: {:?}", perm); + // println!("Hints: {:?}", hints); + // print_gapped_array(&ga); + } + assert!(good); + } + } + + #[test] + fn permutation_test() { + const SIZE: usize = 6; + let items: Vec = (0..SIZE).into_iter().map(|val| val as i32).collect(); + let perms: Vec> = items.into_iter().permutations(SIZE).collect(); + let hints = get_all_possible_hints(SIZE, SIZE); + let mut pb = tqdm!(total = perms.len() * hints.len()); + for perm in perms.iter() { + for hints in hints.iter() { + test_perm_with_hints(perm, hints); + pb.update(1).ok(); + } + } + } + + #[test] + fn debug_gapped() { + let perm = vec![1, 2, 0, 3, 4]; + let hints = vec![0, 0, 3, 0, 0]; + let mut ga = GappedKVArray::::new(perm.len()); + // print_gapped_array(&ga); + for (value, hint) in perm.iter().zip(hints.iter()) { + assert!(ga + .upsert_with_hint((value.clone(), value.clone()), hint.clone()) + .is_ok()); + // println!(""); + // print_gapped_array(&ga); + } + } + + unsafe fn test_nondec_seq(items: &Vec, hints: &Vec) { + let mut ga = GappedKVArray::::new(items.len()); + for (value, hint) in items.iter().zip(hints.iter()) { + assert!(ga + .initial_model_based_insert((value.clone(), value.clone()), hint.clone()) + .is_ok()); + } + for ix in 0..ga.len() { + let good = ga.bitmap[ix] && ga.keys[ix].assume_init() == ix as i32; + if !good { + // println!("Items: {:?}", items); + // println!("Hints: {:?}", hints); + // print_gapped_array(&ga); + } + assert!(good); + } + } + + #[test] + fn initial_inserts() { + const SIZE: usize = 6; + let items: Vec = (0..SIZE).into_iter().map(|val| val as i32).collect(); + let mut sequences = get_all_possible_hints(SIZE, SIZE); + sequences.retain(|seq| { + let mut last: Option = None; + for thing in seq.iter() { + if last.is_some() && *thing < last.unwrap() { 
+ return false; + } + last = Some(*thing); + } + true + }); + for seq in sequences { + unsafe { + test_nondec_seq(&items, &seq); + } + } + } + + #[test] + fn update_gapped_array() { + const SIZE: usize = 6; + let keys = vec![0, 1, 2, 3, 2, 3]; + let vals = vec![10, 11, 22, 33, 42, 53]; + let all_hints = get_all_possible_hints(SIZE, SIZE); + let mut ga = GappedKVArray::::new(SIZE + 1); + let final_keys = vec![0, 1, 2, 3]; + let final_vals = vec![10, 11, 42, 53]; + for hints in all_hints { + for ((key, val), hint) in (keys.iter().zip(vals.iter())).zip(hints.iter()) { + assert!(ga + .upsert_with_hint((key.clone(), val.clone()), hint.clone()) + .is_ok()); + } + for (ix, (key, val)) in ga + .keys + .iter() + .zip(ga.vals.iter()) + .enumerate() + .take(final_keys.len()) + { + unsafe { + assert!(key.assume_init() == final_keys[ix]); + assert!(val.assume_init() == final_vals[ix]); + } + } + } + } + + #[test] + fn trim_gapped_array() { + const SIZE: usize = 6; + let get_fresh_ga = || { + let keys = vec![0, 1, 2, 3, 4, 5]; + let vals = vec![0, 1, 2, 3, 4, 5]; + let mut ga = GappedKVArray::::new(SIZE); + for (key, val) in keys.iter().zip(vals.iter()) { + ga.upsert_with_hint((*key, *val), 3).unwrap(); + } + ga + }; + for hint in 0..SIZE { + // Trim in the middle + let mut mid_ga = get_fresh_ga(); + mid_ga.trim_window(2, 1, hint).unwrap(); + let expected_keys = vec![0, 0, 0, 0, 4, 5]; + let expected_vals = vec![0, 0, 0, 0, 4, 5]; + let expected_bitmap = vec![true, false, false, false, true, true]; + for ix in 0..SIZE { + assert!(mid_ga.bitmap[ix] == expected_bitmap[ix]); + if mid_ga.bitmap[ix] { + unsafe { + assert!(mid_ga.keys[ix].assume_init() == expected_keys[ix]); + assert!(mid_ga.vals[ix].assume_init() == expected_vals[ix]); + } + } + } + } + for hint in 0..SIZE { + // Trim with clipping at both sides + let mut mid_ga = get_fresh_ga(); + mid_ga.trim_window(2, u32::MAX, hint).unwrap(); + let expected_keys = vec![0, 0, 0, 0, 0, 0]; + let expected_vals = vec![0, 0, 0, 0, 0, 
0]; + let expected_bitmap = vec![false, false, false, false, false, false]; + for ix in 0..SIZE { + assert!(mid_ga.bitmap[ix] == expected_bitmap[ix]); + if mid_ga.bitmap[ix] { + unsafe { + assert!(mid_ga.keys[ix].assume_init() == expected_keys[ix]); + assert!(mid_ga.vals[ix].assume_init() == expected_vals[ix]); + } + } + } + } + for hint in 0..SIZE { + // Trim from beginning + let mut front_ga = get_fresh_ga(); + front_ga.trim_window(0, 1, hint).unwrap(); + let expected_keys = vec![0, 0, 2, 3, 4, 5]; + let expected_vals = vec![0, 0, 2, 3, 4, 5]; + let expected_bitmap = vec![false, false, true, true, true, true]; + for ix in 0..SIZE { + assert!(front_ga.bitmap[ix] == expected_bitmap[ix]); + if front_ga.bitmap[ix] { + unsafe { + assert!(front_ga.keys[ix].assume_init() == expected_keys[ix]); + assert!(front_ga.vals[ix].assume_init() == expected_vals[ix]); + } + } + } + } + for hint in 0..SIZE { + // Trim from end + let mut end_ga = get_fresh_ga(); + end_ga + .trim_window((end_ga.len() - 1) as i32, 1, hint) + .unwrap(); + let expected_keys = vec![0, 1, 2, 3, 0, 0]; + let expected_vals = vec![0, 1, 2, 3, 0, 0]; + let expected_bitmap = vec![true, true, true, true, false, false]; + for ix in 0..SIZE { + assert!(end_ga.bitmap[ix] == expected_bitmap[ix]); + if end_ga.bitmap[ix] { + unsafe { + assert!(end_ga.keys[ix].assume_init() == expected_keys[ix]); + assert!(end_ga.vals[ix].assume_init() == expected_vals[ix]); + } + } + } + } + } + + #[test] + fn debug_initial_gapped() { + let perm = vec![0, 1, 2, 3, 4, 5]; + let hints = vec![0, 0, 0, 4, 4, 4]; + let mut ga = GappedKVArray::::new(perm.len()); + // print_gapped_array(&ga); + for (value, hint) in perm.iter().zip(hints.iter()) { + assert!(ga + .initial_model_based_insert((value.clone(), value.clone()), hint.clone()) + .is_ok()); + // println!(""); + // print_gapped_array(&ga); + } + } +} diff --git a/utils/learned_segment/Cargo.toml b/utils/learned_segment/Cargo.toml new file mode 100644 index 0000000..4dcd9d6 --- 
/dev/null +++ b/utils/learned_segment/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "learned_index_segmentation" +version = "0.1.0" +edition = "2021" +description = "A collection of algorithms for building piecewise models for learned indices." +authors = ["Mark Pekala ", "Lev Kruglyak "] +keywords = [] +license = "Apache-2.0" +repository = "https://github.com/LevKruglyak/limousine" + +[dev-dependencies] +rand = "0.8.5" +rand_distr = "0.4.3" + +[dependencies] +num = "0.4.2" +serde = { version = "1.0.197", features = ["derive"] } diff --git a/utils/learned_segment/src/lib.rs b/utils/learned_segment/src/lib.rs new file mode 100644 index 0000000..90849a6 --- /dev/null +++ b/utils/learned_segment/src/lib.rs @@ -0,0 +1,6 @@ +mod model; +mod point; +mod segmentation; + +pub use model::LinearModel; +pub use segmentation::linear_simple_segmentation; diff --git a/utils/learned_segment/src/model.rs b/utils/learned_segment/src/model.rs new file mode 100644 index 0000000..5b98ea7 --- /dev/null +++ b/utils/learned_segment/src/model.rs @@ -0,0 +1,95 @@ +//! This file defines the Model portion of the PGM, which is simply just a +//! linear approximator. +//! +//! NOTE: We are making a simplification and forcing approximation lines +//! to pass through the origin, which slightly degrades performance + +use num::PrimInt; +use serde::{Deserialize, Serialize}; + +/// A simple linear model for a key-rank segment of data. +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct LinearModel { + /// Define the approximation line. See note at top of file about forcing + /// approximations to pass through the origin. + pub(crate) key: K, + pub(crate) slope: f64, + + /// How many entries are indexed by this model. Not strictly needed but + /// useful for debugging. 
+ pub(crate) size: usize, +} + +impl LinearModel { + /// Construct a new model from the smallest key, slope, and size + pub fn new(key: K, slope: f64, size: usize) -> Self { + debug_assert!(slope.is_normal()); + Self { key, slope, size } + } + + /// Approximation logic for linear models + pub fn approximate(&self, key: &K) -> (usize, usize) { + let run = num::cast::(key.clone().saturating_sub(self.key)).unwrap(); + let pos = (run * self.slope).floor() as i64; + let pos = pos.max(0) as usize; + + (pos.saturating_sub(EPSILON), pos + EPSILON + 2) + } + + /// Instead of returning a window'd approximation, return a hint, which + /// is better for gapped arrays with exponential search + /// (I.e., it's a hint for where to _start_ searching for the element, not + /// a window which is guaranteed to hold the value) + pub fn hint(&self, key: &K) -> usize { + let run = num::cast::(key.clone().saturating_sub(self.key)).unwrap(); + let pos = (run * self.slope).floor() as i64; + pos.max(0) as usize + } + + /// Construct a sentinel model which will sit at the end of a layer + pub fn sentinel() -> Self { + Self { + key: K::max_value(), + slope: 0.0, + size: 0, + } + } + + /// Rescales the slope of the model + pub fn rescale(&mut self, c: f64) { + self.slope *= c; + } +} + +impl LinearModel { + pub fn min_key(&self) -> &K { + &self.key + } +} + +// impl + +// Simple component with simple test(s) +#[cfg(test)] +mod pgm_model_tests { + use super::*; + + #[test] + fn pgm_model_basic() { + const EPS: usize = 2; + let key: usize = 10; + let slope: f64 = 1.0; + let slope_usize: usize = 1; + let model: LinearModel = LinearModel::new(key, slope, 6); + + for test in 20..1000 { + let test: usize = test; + let approx = model.approximate(&test); + let expected_lo = (test - key) * slope_usize - EPS; + let expected_hi = expected_lo + EPS * 2 + 2; + + assert!(approx.0 == expected_lo); + assert!(approx.1 == expected_hi); + } + } +} diff --git a/utils/learned_segment/src/point.rs 
b/utils/learned_segment/src/point.rs new file mode 100644 index 0000000..d7d2a2b --- /dev/null +++ b/utils/learned_segment/src/point.rs @@ -0,0 +1,29 @@ +use std::ops::Sub; + +use num::PrimInt; + +#[derive(Clone)] +pub struct Point { + x: K, + y: i32, +} + +impl Point { + pub fn new(x: K, y: i32) -> Self { + Self { x, y } + } + + /// Slope of the line connecting (0,0) to this point. + pub fn slope(self) -> f64 { + let run = num::cast::(self.x).unwrap(); + (self.y as f64) / run + } +} + +impl Sub for Point { + type Output = Self; + + fn sub(self, rhs: Self) -> Self::Output { + Point::new(self.x.saturating_sub(rhs.x), self.y.saturating_sub(rhs.y)) + } +} diff --git a/utils/learned_segment/src/segmentation.rs b/utils/learned_segment/src/segmentation.rs new file mode 100644 index 0000000..e60ed8f --- /dev/null +++ b/utils/learned_segment/src/segmentation.rs @@ -0,0 +1,270 @@ +use num::PrimInt; + +use crate::{model::LinearModel, point::Point}; + +/// A data structure that will grow to incorporate points while building a PGM and eventually +/// produce a proper linear model, before moving on to the next one +pub struct LinearSimpleSegmentation { + pub first_key: Option, + pub entries: Vec<(K, V)>, + pub max_slope: f64, + pub min_slope: f64, + pub num_entries: usize, + // // For sanity checking that input is increasing + // #[cfg(debug_assertions)] + // last_key: Option, +} + +impl LinearSimpleSegmentation { + pub fn new() -> Self { + Self { + first_key: None, + entries: Vec::new(), + max_slope: f64::MAX, + min_slope: f64::MIN, + num_entries: 0, + // #[cfg(debug_assertions)] + // last_key: None, + } + } + + /// Tries to add an entry to this segmentor, returning a result about whether it was + /// successful. 
+ fn try_add_entry(&mut self, entry: (K, V)) -> Result<(), (K, V)> { + if self.num_entries == 0 { + // First entry: record its key as the segment's base key and accept it unconditionally + self.first_key = Some(entry.0); + self.entries = vec![entry]; + self.num_entries = 1; + + return Ok(()); + } + + // Sanity checks (compiled in debug builds only) + #[cfg(debug_assertions)] + { + debug_assert!(self.first_key.is_some()); + debug_assert!(self.entries.len() == self.num_entries); + // debug_assert!(self.last_key.is_some()); + // debug_assert!(self.last_key.clone().unwrap() < entry.0); + } + + // Build the two extreme (key, rank) points for this entry: its rank pushed up and down by EPSILON + let base_point = Point::new(self.first_key.clone().unwrap(), 0); + let max_point = Point::new( + entry.0, + self.num_entries + .saturating_add(1) // The 1-based rank of the new entry + .saturating_sub(1) // Lower the upper bound by one to deal with floating point annoyances + .saturating_add(EPSILON) as i32, + ); + let min_point = Point::new( + entry.0, + self.num_entries + .saturating_add(1) // The 1-based rank of the new entry + .saturating_add(1) // Raise the lower bound by one to deal with floating point annoyances + .saturating_sub(EPSILON) as i32, + ); + let this_max = (max_point - base_point.clone()).slope(); + let this_min = (min_point - base_point.clone()).slope(); + + if self.num_entries == 1 { + self.max_slope = this_max; + self.min_slope = this_min; + } else { + let new_max_slope = this_max.min(self.max_slope); + let new_min_slope = this_min.max(self.min_slope); + if new_min_slope >= new_max_slope { + // The allowed slope range would be empty, so this point can't fit in the model; hand the entry back + return Err(entry); + } + // SANITY TESTING + #[cfg(debug_assertions)] + { + // Max slope should be monotonically decreasing + debug_assert!(new_max_slope <= self.max_slope); + // Min slope should be monotonically increasing + debug_assert!(new_min_slope >= self.min_slope); + } + + self.max_slope = new_max_slope; + self.min_slope = new_min_slope; + } + + // This point is fine to add, and we've already updated the slope bounds + self.num_entries += 1; + self.entries.push(entry); + + Ok(()) + } + + // Outputs a linear model that fits all the points presented so far +
pub fn to_linear_model(&self) -> LinearModel { + assert!(self.first_key.is_some()); + assert!(self.num_entries > 0); + + let slope = if self.num_entries > 1 { + (self.max_slope + self.min_slope) / 2.0 + } else { + // A model that only has one point can pick any slope; we pick 1 arbitrarily + 1.0 + }; + + LinearModel::new(self.first_key.unwrap(), slope, self.num_entries) + } + + /// Takes ownership of the entries generating this linear model, leaving an empty entry list behind (num_entries is not reset) + pub fn take_entries(&mut self) -> Vec<(K, V)> { + std::mem::replace(&mut self.entries, vec![]) + } + + pub fn is_empty(&self) -> bool { + self.num_entries == 0 + } +} + +#[must_use] +pub fn linear_simple_segmentation( + data: impl Iterator, +) -> Vec<(LinearModel, Vec<(K, V)>)> { + let mut result: Vec<(LinearModel, Vec<(K, V)>)> = vec![]; + + let mut cur_segment: LinearSimpleSegmentation = LinearSimpleSegmentation::new(); + + for entry in data { + match cur_segment.try_add_entry(entry) { + Ok(_) => { + // Nothing to do, entry added successfully + } + Err(entry) => { + // Export the model currently specified by the segmentor + result.push((cur_segment.to_linear_model(), cur_segment.take_entries())); + // Reset current segmentor + cur_segment = LinearSimpleSegmentation::new(); + + // Always Ok: an empty segmentor accepts its first entry unconditionally, so the result is discarded + cur_segment.try_add_entry(entry).ok(); + } + } + } + + // Handle last segment + if !cur_segment.is_empty() { + result.push((cur_segment.to_linear_model(), cur_segment.take_entries())); + } + + result +} + +/// We'll test this part just by initializing TONS of indexes and making sure every key in every index is +/// properly indexed +#[cfg(test)] +mod pgm_segmentation_tests { + use rand::{distributions::Uniform, Rng}; + + use super::*; + + type Key = usize; + type Value = usize; + + /// To test with different epsilon we need a struct that can handle that generic + struct PGMSegTestCase { + verbose: bool, + entries: Vec<(Key, Value)>, + models: Vec>, + values: Vec, + last_model_ix: usize, +
last_base_rank: usize, + } + + impl PGMSegTestCase { + /// Generates a test key, meaning make the entries, sort + dedup them + fn generate(size: usize, verbose: Option) -> Self { + let verbose = verbose.unwrap_or(true); + if verbose { + println!("Generating {} entries with eps={}", size, EPSILON); + } + let range = Uniform::from((Key::MIN)..(Key::MAX)); + let mut random_values: Vec = + rand::thread_rng().sample_iter(&range).take(size).collect(); + random_values.sort(); + random_values.dedup(); + let entries: Vec<(Key, Value)> = random_values + .into_iter() + .enumerate() + .map(|(ix, key)| (key, ix)) + .collect(); + + Self { + entries, + verbose, + models: vec![], + values: vec![], + last_model_ix: 0, + last_base_rank: 0, + } + } + + /// Assuming data has already been generated, segments it as a layer + fn train(&mut self) { + if self.verbose { + println!( + "Training on {} entries with eps={}", + self.entries.len(), + EPSILON + ); + } + let trained: Vec<(LinearModel, Vec<(Key, Value)>)> = + linear_simple_segmentation(self.entries.clone().into_iter()); + + self.models.clear(); + self.values.clear(); + trained.into_iter().for_each(|(model, values)| { + self.models.push(model); + self.values.push(values[0].0); + }); + } + + /// Helper function for determining if a single entry is approximated within bounds + fn is_entry_well_approximated(&mut self, entry: (Key, Value)) -> bool { + let mut model_ix = self.last_model_ix; + let mut base_rank = self.last_base_rank; + while model_ix < self.models.len().saturating_sub(1) { + if self.models[model_ix + 1].key > entry.0 { + break; + } + + base_rank += self.models[model_ix].size; + model_ix += 1; + } + let range = self.models[model_ix].approximate(&entry.0); + self.last_base_rank = base_rank; + self.last_model_ix = model_ix; + return base_rank + range.0 <= entry.1 && entry.1 < base_rank + range.1; + } + + /// Assuming data has already been generated and trained on, tests that every key is correctly approximated + fn test(&mut 
self) { + for entry in self.entries.clone() { + assert!(self.is_entry_well_approximated(entry)); + } + } + } + + /// Test with different epsilons + macro_rules! test_eps { + ($fname: ident, $val: expr) => { + #[test] + fn $fname() { + let mut test_case: PGMSegTestCase<$val> = + PGMSegTestCase::generate(10_000_000, None); + test_case.train(); + test_case.test(); + } + }; + } + test_eps!(test_eps4, 4); + test_eps!(test_eps8, 8); + test_eps!(test_eps16, 16); + test_eps!(test_eps64, 64); +}