From 4586d21d85c282f14b90017e008bc9483f112c92 Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Tue, 30 Jan 2024 11:49:11 +0900 Subject: [PATCH 01/24] Significantly reduce allocations --- Cargo.lock | 399 ++++++++++++++++++++++++-------- Cargo.toml | 13 +- README.md | 2 +- benches/tuning_parameters.rs | 46 ---- src/cmd/profiling.rs | 35 ++- src/cmd/timings.rs | 2 +- src/counts.rs | 358 ++++++++++++++++++++++++++++ src/lib.rs | 4 + src/radix_sort_builder.rs | 1 + src/sorter.rs | 180 +++++++++----- src/sorts/comparative_sort.rs | 8 +- src/sorts/lsb_sort.rs | 228 +++++++++++------- src/sorts/mod.rs | 13 -- src/sorts/mt_lsb_sort.rs | 90 +++---- src/sorts/out_of_place_sort.rs | 160 ++++++------- src/sorts/recombinating_sort.rs | 127 +++++----- src/sorts/regions_sort.rs | 119 ++++++---- src/sorts/scanning_sort.rs | 49 ++-- src/sorts/ska_sort.rs | 47 ++-- src/utils/sort_utils.rs | 273 ++++------------------ src/utils/test_utils.rs | 14 +- 21 files changed, 1341 insertions(+), 827 deletions(-) delete mode 100644 benches/tuning_parameters.rs create mode 100644 src/counts.rs diff --git a/Cargo.lock b/Cargo.lock index 2777b2f..b524a4c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,21 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "addr2line" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + [[package]] name = "aho-corasick" version = "1.1.2" @@ -26,8 +41,9 @@ checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" [[package]] name = "arbitrary-chunks" version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ad8689a486416c401ea15715a4694de30054248ec627edbf31f49cb64ee4086" +dependencies = [ + "rayon", +] [[package]] name = "autocfg" @@ -35,11 +51,32 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "backtrace" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" -version = "2.4.1" +version = "2.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" [[package]] name = "block-pseudorand" @@ -89,9 +126,9 @@ dependencies = [ [[package]] name = "ciborium" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" dependencies = [ "ciborium-io", "ciborium-ll", @@ -100,15 +137,15 @@ dependencies = [ [[package]] name = "ciborium-io" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" [[package]] name = "ciborium-ll" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" dependencies = [ "ciborium-io", "half", @@ -116,18 +153,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.4.8" +version = "4.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2275f18819641850fa26c89acc84d465c1bf91ce57bc2748b28c420473352f64" +checksum = "1e578d6ec4194633722ccf9544794b71b1385c3c027efe0c55db226fc880865c" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.4.8" +version = "4.4.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07cdf1b148b25c1e1f7a42225e30a0d99a615cd4637eae7365548dd4529b95bc" +checksum = "4df4df40ec50c46000231c914968278b1eb05098cf8f1b3a518a95030e71d1c7" dependencies = [ "anstyle", "clap_lex", @@ -177,35 +214,49 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ - "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" -version = "0.9.15" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "autocfg", - "cfg-if", "crossbeam-utils", - "memoffset", - "scopeguard", ] [[package]] name = "crossbeam-utils" -version = "0.8.16" +version = "0.8.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" + +[[package]] +name = "crunchy" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a22b2d63d4d1dc0b7f1b6b2747dd0088008a9be28b6ddf0b1e7d335e3037294" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "dhat" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f2aaf837aaf456f6706cb46386ba8dffd4013a757e36f4ea05c20dd46b209a3" dependencies = [ - "cfg-if", + "backtrace", + "lazy_static", + "mintex", + "parking_lot", + "rustc-hash", + "serde", + "serde_json", + "thousands", ] [[package]] @@ -216,31 +267,41 @@ checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" [[package]] name = "errno" -version = "0.3.7" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f258a7194e7f7c2a7837a8913aeab7fd8c383457034fa20ce4dd3dcb813e8eb8" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" dependencies = [ "libc", "windows-sys", ] +[[package]] +name = "gimli" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" + [[package]] name = "half" -version = "1.8.2" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" +checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872" +dependencies = [ + "cfg-if", + "crunchy", +] [[package]] name = "hermit-abi" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" +checksum = "5d3d0e0f38255e7fa3cf31335b3a56f05febd18025f4db5ef7a0cfb4f8da651f" [[package]] name = "is-terminal" -version = "0.4.9" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" +checksum = "0bad00257d07be169d870ab665980b06cdb366d792ad690bf2e76876dc503455" dependencies = [ "hermit-abi", "rustix", @@ -258,30 +319,46 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.9" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" [[package]] name = "js-sys" -version = "0.3.65" +version = "0.3.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54c0c35952f67de54bb584e9fd912b3023117cbafc0a77d8f3dee1fb5f572fe8" +checksum = "9a1d36f1235bc969acba30b7f5990b864423a6068a10f7c90ae8f0112e3a59d1" dependencies = [ "wasm-bindgen", ] +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + [[package]] name = "libc" -version = "0.2.150" +version = "0.2.152" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" +checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" [[package]] name = "linux-raw-sys" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + +[[package]] +name = "lock_api" version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "969488b55f8ac402214f3f5fd243ebb7206cf82de60d3172994707a4bcc2b829" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +dependencies = [ + "autocfg", + "scopeguard", +] [[package]] name = "log" @@ -291,19 +368,25 @@ checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] name = "memchr" -version = "2.6.4" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] -name = "memoffset" -version = "0.9.0" +name = "miniz_oxide" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" +checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" dependencies = [ - "autocfg", + "adler", ] +[[package]] +name = "mintex" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bec4598fddb13cc7b528819e697852653252b760f1228b7642679bf2ff2cd07" + [[package]] name = "nanorand" version = "0.6.1" @@ -319,11 +402,20 @@ dependencies = [ "autocfg", ] +[[package]] +name = "object" +version = "0.32.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "oorandom" @@ -331,6 +423,29 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets 0.48.5", +] + [[package]] name = "partition" version = "0.1.2" @@ -367,27 +482,27 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.69" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.33" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] [[package]] name = "rayon" -version = "1.8.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c27db03db7734835b3f53954b534c91069375ce6ccaa2e065441e07d9b6cdb1" +checksum = "fa7237101a77a10773db45d62004a272517633fbcc3df19d96455ede1122e051" dependencies = [ "either", "rayon-core", @@ -395,9 +510,9 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.12.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ce3fb6ad83f861aac485e76e1985cd109d9a3713802152be56c3b1f0e0658ed" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ "crossbeam-deque", "crossbeam-utils", @@ -409,18 +524,29 @@ version = "0.20.12" dependencies = [ "arbitrary-chunks", "block-pseudorand", + "bumpalo", "criterion", + "dhat", "partition", "rayon", "tikv-jemallocator", "voracious_radix_sort", ] +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "regex" -version = "1.10.2" +version = "1.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" dependencies = [ "aho-corasick", "memchr", @@ -430,9 +556,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.3" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" dependencies = [ "aho-corasick", "memchr", @@ -445,13 +571,25 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +[[package]] +name = "rustc-demangle" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustix" -version = "0.38.24" +version = "0.38.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ad981d6c340a49cdc40a1028d9c6084ec7e9fa33fcb839cab656a267071e234" +checksum = "322394588aaf33c24007e8bb3238ee3e4c5c09c084ab32bc73890b99ff326bca" dependencies = [ - "bitflags", + "bitflags 2.4.2", "errno", "libc", "linux-raw-sys", @@ -460,9 +598,9 @@ dependencies = [ [[package]] name = "ryu" -version = "1.0.15" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" +checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" [[package]] name = "same-file" @@ -481,18 +619,18 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "serde" -version = "1.0.192" +version = "1.0.196" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bca2a08484b285dcb282d0f67b26cadc0df8b19f8c12502c13d966bf9482f001" +checksum = "870026e60fa08c69f064aa766c10f10b1d62db9ccd4d0abb206472bee0ce3b32" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.192" +version = "1.0.196" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6c7207fbec9faa48073f3e3074cbe553af6ea512d7c21ba46e434e70ea9fbc1" +checksum = "33c85360c95e7d137454dc81d9a4ed2b8efd8fbe19cee57357b32b9771fccb67" dependencies = [ "proc-macro2", "quote", @@ -501,26 +639,38 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.108" +version = "1.0.112" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d1c7e3eac408d115102c4c24ad393e0821bb3a5df4d506a80f85f7a742a526b" +checksum = "4d1bd37ce2324cf3bf85e5a25f96eb4baf0d5aa6eba43e7ae8958870c4ec48ed" dependencies = [ "itoa", "ryu", "serde", ] +[[package]] +name = "smallvec" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" + [[package]] name = "syn" -version = "2.0.39" +version = "2.0.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23e78b90f2fcf45d3e842032ce32e3f2d1545ba6636271dcbf24fa306d87be7a" +checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] +[[package]] +name = "thousands" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" + [[package]] name = "tikv-jemalloc-sys" version = "0.5.4+5.3.0-patched" @@ -578,9 +728,9 @@ dependencies = [ [[package]] name = "wasm-bindgen" -version = "0.2.88" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7daec296f25a1bae309c0cd5c29c4b260e510e6d813c286b19eaadf409d40fce" +checksum = "b1223296a201415c7fad14792dbefaace9bd52b62d33453ade1c5b5f07555406" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -588,9 +738,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.88" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e397f4664c0e4e428e8313a469aaa58310d302159845980fd23b0f22a847f217" +checksum = "fcdc935b63408d58a32f8cc9738a0bffd8f05cc7c002086c6ef20b7312ad9dcd" dependencies = [ "bumpalo", "log", @@ -603,9 +753,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.88" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5961017b3b08ad5f3fe39f1e79877f8ee7c23c5e5fd5eb80de95abc41f1f16b2" +checksum = "3e4c238561b2d428924c49815533a8b9121c664599558a5d9ec51f8a1740a999" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -613,9 +763,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.88" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5353b8dab669f5e10f5bd76df26a9360c748f054f862ff5f3f8aae0c7fb3907" +checksum = "bae1abb6806dc1ad9e560ed242107c0f6c84335f1749dd4e8ddb012ebd5e25a7" dependencies = [ "proc-macro2", "quote", @@ -626,15 +776,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.88" +version = "0.2.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d046c5d029ba91a1ed14da14dca44b68bf2f124cfbaf741c54151fdb3e0750b" +checksum = "4d91413b1c31d7539ba5ef2451af3f0b833a005eb27a631cec32bc0635a8602b" [[package]] name = "web-sys" -version = "0.3.65" +version = "0.3.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5db499c5f66323272151db0e666cd34f78617522fb0c1604d31a27c50c206a85" +checksum = "58cd2333b6e0be7a39605f0e255892fd7418a682d8da8fe042fe25128794d2ed" dependencies = [ "js-sys", "wasm-bindgen", @@ -673,11 +823,11 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-sys" -version = "0.48.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets", + "windows-targets 0.52.0", ] [[package]] @@ -686,13 +836,28 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +dependencies = [ + "windows_aarch64_gnullvm 0.52.0", + "windows_aarch64_msvc 0.52.0", + "windows_i686_gnu 0.52.0", + "windows_i686_msvc 0.52.0", + "windows_x86_64_gnu 0.52.0", + "windows_x86_64_gnullvm 0.52.0", + "windows_x86_64_msvc 0.52.0", ] [[package]] @@ -701,38 +866,80 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" + [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" +[[package]] +name = "windows_i686_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" + [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" +[[package]] +name = "windows_i686_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" diff --git a/Cargo.toml b/Cargo.toml index b2f8cb6..0d268cd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,9 +1,9 @@ [package] name = "rdst" -description = "A flexible parallel unstable radix sort that supports sorting by any arbitrarily defined sequence of bytes." +description = "A flexible, parallel, unstable radix sort. Sort arbitrary types in whatever byte order you wish... or just sort numbers very fast!" version = "0.20.12" authors = ["Nathan Essex "] -edition = "2018" +edition = "2021" license = "Apache-2.0 OR MIT" repository = "https://github.com/Nessex/rdst" homepage = "https://github.com/Nessex/rdst" @@ -20,13 +20,15 @@ timings = ["multi-threaded"] [dependencies] rayon = { version = "1.8", optional = true } -arbitrary-chunks = "0.4.1" +arbitrary-chunks = { path = "../arbitrary-chunks" } partition = "0.1.2" +bumpalo = { version = "3.14.0", features = ["collections"] } [dev-dependencies] rayon = "1.8" criterion = "0.5.1" block-pseudorand = "0.1.2" +dhat = "0.3.2" [target.'cfg(all(not(target_env = "msvc"), tuning))'.dependencies] tikv-jemallocator = "0.5.4" @@ -57,11 +59,6 @@ name = "struct_sort" harness = false required-features = ["multi-threaded"] -[[bench]] -name = "tuning_parameters" -harness = false -required-features = ["multi-threaded"] - [[bin]] # Requires: RUSTFLAGS="--cfg bench --cfg tuning" AND --features profiling # Suggestions for a better alternative very welcome... diff --git a/README.md b/README.md index 7427e69..cab50c6 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ ![Crates.io](https://img.shields.io/crates/l/rdst?style=flat-square) ![Crates.io](https://img.shields.io/crates/v/rdst?style=flat-square) -rdst is a flexible native Rust implementation of multi-threaded unstable radix sort. +rdst is a flexible native Rust implementation of multithreaded unstable radix sort. ## Usage diff --git a/benches/tuning_parameters.rs b/benches/tuning_parameters.rs deleted file mode 100644 index 5219da3..0000000 --- a/benches/tuning_parameters.rs +++ /dev/null @@ -1,46 +0,0 @@ -use criterion::*; -use rayon::current_num_threads; -use rdst::utils::bench_utils::bench_common; -use rdst::utils::*; -use std::cmp::max; - -fn tune_counts(c: &mut Criterion) { - let tests: Vec<(&str, Box)>)> = vec![ - ( - "get_counts", - Box::new(|input: Vec<_>| { - let (c, _) = get_counts(&input, 0); - black_box(c); - }), - ), - ( - "par_get_counts", - Box::new(|input: Vec<_>| { - let (c, _) = par_get_counts(&input, 0); - black_box(c); - }), - ), - ( - "get_tile_counts", - Box::new(|input: Vec<_>| { - let tile_size = max(30_000, cdiv(input.len(), current_num_threads())); - let (c, _) = get_tile_counts(&input, tile_size, 0); - black_box(c); - }), - ), - ( - "get_tile_counts_and_aggregate", - Box::new(|input: Vec<_>| { - let tile_size = max(30_000, cdiv(input.len(), current_num_threads())); - let (c, _) = get_tile_counts(&input, tile_size, 0); - let a = aggregate_tile_counts(&c); - black_box(a); - }), - ), - ]; - - bench_common(c, 0u32, "tune_counts", tests); -} - -criterion_group!(tuning_parameters, tune_counts,); -criterion_main!(tuning_parameters); diff --git a/src/cmd/profiling.rs b/src/cmd/profiling.rs index 2a1db7f..5de799f 100644 --- a/src/cmd/profiling.rs +++ b/src/cmd/profiling.rs @@ -3,29 +3,54 @@ /// /// e.g. /// ``` -/// RUSTFLAGS='--cfg bench --cfg tuning -g -C opt-level=3 -C force-frame-pointers=y -C target-cpu=native -C target-feature=+neon' cargo +nightly instruments -t time --bin profiling --features profiling +/// RUSTFLAGS='--cfg bench --cfg tuning -g -C opt-level=3 -C force-frame-pointers=y -C target-cpu=apple-m1 -C target-feature=+neon' cargo +nightly instruments -t time --bin profiling --features profiling /// ``` #[cfg(not(all(tuning, bench)))] compile_error!("This binary must be run with `RUSTFLAGS='--cfg tuning --cfg bench'`"); +use rdst::tuner::{Algorithm, Tuner, TuningParams}; use rdst::utils::test_utils::gen_inputs; use rdst::RadixSort; use std::thread::sleep; use std::time::{Duration, Instant}; +struct MyTuner {} + +impl Tuner for MyTuner { + fn pick_algorithm(&self, p: &TuningParams, _: &[usize]) -> Algorithm { + if p.input_len < 128 { + return Algorithm::Comparative; + } + + let depth = p.total_levels - p.level - 1; + match depth { + 0 => Algorithm::MtLsb, + _ => Algorithm::Lsb, + } + } +} + fn main() { // Randomly generate an array of // 200_000_000 u64's with half shifted >> 32 and half shifted << 32 - let mut inputs = gen_inputs(200_000_000, 16u32); + let mut inputs = gen_inputs(200_000_000, 0u32); + let mut inputs_2 = gen_inputs(200_000_000, 0u32); // Input generation is multi-threaded and hard to differentiate from the actual // sorting algorithm, depending on the profiler. This makes it more obvious. sleep(Duration::from_millis(300)); + inputs.radix_sort_builder().with_tuner(&MyTuner {}).sort(); + + // A second run, for comparison + sleep(Duration::from_millis(300)); let time = Instant::now(); - inputs.radix_sort_unstable(); + inputs_2.radix_sort_builder().with_tuner(&MyTuner {}).sort(); + + let e = time.elapsed().as_millis(); + println!("Elapsed: {}ms", e); - println!("Elapsed: {}ms", time.elapsed().as_millis()); - println!("{:?}", &inputs[0..5]); + // Ensure nothing gets optimized out + println!("{:?} {:?}", &inputs[0], &inputs_2[0]); } diff --git a/src/cmd/timings.rs b/src/cmd/timings.rs index 3be324f..e5bb042 100644 --- a/src/cmd/timings.rs +++ b/src/cmd/timings.rs @@ -9,7 +9,7 @@ //! You may need to tweak the command below for your own machine. //! //! ``` -//! RUSTFLAGS='--cfg bench --cfg tuning -C opt-level=3 -C target-cpu=native -C target-feature=+neon' cargo +nightly run --bin timings --features timings -- 1234 "Hello world" +//! RUSTFLAGS='--cfg bench --cfg tuning -C opt-level=3 -C target-cpu=apple-m1 -C target-feature=+neon' cargo +nightly run --bin timings --features timings -- 1234 "Hello world" //! ``` //! //! - `1234` is where you place the ID for your run. If you are just running a brief test this can be `N/A`, otherwise it should be something like a commit SHA that you can use to find the code for this run again. diff --git a/src/counts.rs b/src/counts.rs new file mode 100644 index 0000000..ba08747 --- /dev/null +++ b/src/counts.rs @@ -0,0 +1,358 @@ +use std::cell::RefCell; + +use std::ops::{Index, IndexMut}; + +use crate::RadixKey; +use bumpalo::Bump; +use std::rc::Rc; +use std::slice::{Iter, SliceIndex}; + +#[derive(Default)] +pub struct CountManager {} + +#[repr(C, align(4096))] +#[derive(Clone)] +pub struct Counter([usize; 256 * 4]); + +impl Default for Counter { + fn default() -> Self { + Counter([0usize; 256 * 4]) + } +} + +#[repr(C, align(2048))] +#[derive(Clone)] +pub struct Counts([usize; 256]); +pub type PrefixSums = Counts; +pub type EndOffsets = Counts; + +impl Index for Counts +where + I: SliceIndex<[usize]>, +{ + type Output = I::Output; + + #[inline(always)] + fn index(&self, index: I) -> &I::Output { + &self.0[index] + } +} + +impl IndexMut for Counts +where + I: SliceIndex<[usize]>, +{ + #[inline(always)] + fn index_mut(&mut self, index: I) -> &mut I::Output { + &mut self.0[index] + } +} + +impl Default for Counts { + fn default() -> Self { + Counts([0usize; 256]) + } +} + +#[derive(Default, Clone, Copy)] +pub struct CountMeta { + pub first: u8, + pub last: u8, + pub already_sorted: bool, +} + +#[derive(Default)] +struct ThreadContext { + pub counter: RefCell, + pub counts: RefCell>>>, + pub bump: Bump, +} + +impl CountManager { + thread_local! { + static THREAD_CTX: ThreadContext = Default::default(); + } + + #[inline(always)] + pub fn get_empty_counts(&self) -> Rc> { + if let Some(counts) = Self::THREAD_CTX.with(|ct| ct.counts.borrow_mut().pop()) { + counts + } else { + Default::default() + } + } + + #[inline(always)] + pub fn return_counts(&self, counts: Rc>) { + counts.borrow_mut().clear(); + Self::THREAD_CTX.with(|ct| ct.counts.borrow_mut().push(counts)); + } + + #[inline(always)] + pub fn count_into( + &self, + counts: &mut Counts, + meta: &mut CountMeta, + bucket: &[T], + level: usize, + ) { + Self::THREAD_CTX.with(|ct| { + ct.counter + .borrow_mut() + .count_into(counts, meta, bucket, level) + }) + } + + #[inline(always)] + pub fn counts(&self, bucket: &[T], level: usize) -> (Rc>, bool) { + let counts = self.get_empty_counts(); + let mut meta = CountMeta::default(); + Self::THREAD_CTX.with(|ct| { + ct.counter + .borrow_mut() + .count_into(&mut counts.borrow_mut(), &mut meta, bucket, level) + }); + + (counts, meta.already_sorted) + } + + #[inline(always)] + pub fn prefix_sums(&self, counts: &Counts) -> Rc> { + let sums = self.get_empty_counts(); + let mut s = sums.borrow_mut(); + + let mut running_total = 0; + for (i, c) in counts.into_iter().enumerate() { + s[i] = running_total; + running_total += c; + } + drop(s); + + sums + } + + #[inline(always)] + pub fn end_offsets( + &self, + counts: &Counts, + prefix_sums: &PrefixSums, + ) -> Rc> { + let end_offsets = self.get_empty_counts(); + let mut eo = end_offsets.borrow_mut(); + + eo[0..255].copy_from_slice(&prefix_sums[1..256]); + eo[255] = counts[255] + prefix_sums[255]; + drop(eo); + + end_offsets + } + + #[inline(always)] + pub fn with_tmp_buffer( + &self, + len: usize, + mut f: F, + ) { + Self::THREAD_CTX.with(|ct| { + let mut tmp = bumpalo::collections::Vec::with_capacity_in(len, &ct.bump); + // Safety: It's up to the caller to ensure that all values in the tmp buffer are overwritten before use + unsafe { + tmp.set_len(len); + } + f(self, &mut tmp); + drop(tmp); + }) + } +} + +impl Counter { + #[inline(always)] + fn clear(&mut self) { + self.0.iter_mut().for_each(|x| *x = 0); + } + + #[inline(always)] + pub fn count_into( + &mut self, + counts: &mut Counts, + meta: &mut CountMeta, + bucket: &[T], + level: usize, + ) { + #[cfg(feature = "work_profiles")] + println!("({}) COUNT", level); + + self.clear(); + counts.clear(); + + if bucket.is_empty() { + meta.first = 0; + meta.last = 0; + meta.already_sorted = true; + return; + } else if bucket.len() == 1 { + let b = bucket[0].get_level(level) as usize; + counts.inc(b); + + meta.first = b as u8; + meta.last = b as u8; + meta.already_sorted = true; + return; + } + + let mut already_sorted = true; + let first = bucket.first().unwrap().get_level(level); + let last = bucket.last().unwrap().get_level(level); + + let mut continue_from = bucket.len(); + let mut prev = 0usize; + + // First, count directly into the output buffer until we find a value that is out of order. + for (i, item) in bucket.iter().enumerate() { + let b = item.get_level(level) as usize; + counts.inc(b); + + if b < prev { + continue_from = i + 1; + already_sorted = false; + break; + } + + prev = b; + } + + if continue_from == bucket.len() { + meta.first = first; + meta.last = last; + meta.already_sorted = already_sorted; + return; + } + + let chunks = bucket[continue_from..].chunks_exact(4); + let rem = chunks.remainder(); + + chunks.into_iter().for_each(|chunk| { + let a = chunk[0].get_level(level) as usize; + let b = chunk[1].get_level(level) as usize; + let c = chunk[2].get_level(level) as usize; + let d = chunk[3].get_level(level) as usize; + + self.0[a * 4] += 1; + self.0[1 + b * 4] += 1; + self.0[2 + c * 4] += 1; + self.0[3 + d * 4] += 1; + }); + + rem.iter().for_each(|v| { + let b = v.get_level(level) as usize; + counts.inc(b); + }); + + for i in 0..256 { + let agg = self.0[i * 4] + self.0[1 + i * 4] + self.0[2 + i * 4] + self.0[3 + i * 4]; + counts.add(i, agg); + } + + meta.first = first; + meta.last = last; + meta.already_sorted = already_sorted; + } +} + +pub struct CountIter<'a>(&'a Counts, usize); + +pub struct CountIterEnumerable<'a>(&'a mut CountIter<'a>); + +impl<'a> CountIter<'a> { + #[inline(always)] + pub fn enumerate(&'a mut self) -> CountIterEnumerable<'a> { + CountIterEnumerable(self) + } +} + +impl Counts { + #[inline(always)] + pub fn clear(&mut self) { + self.0.iter_mut().for_each(|x| *x = 0); + } + + #[inline(always)] + pub fn get_count(self, radix: usize) -> usize { + debug_assert!(radix < 256); + unsafe { *self.0.get_unchecked(radix) } + } + + #[inline(always)] + pub fn inc(&mut self, radix: usize) { + debug_assert!(radix < 256); + unsafe { + *self.0.get_unchecked_mut(radix) += 1; + } + } + + #[inline(always)] + pub fn add(&mut self, radix: usize, count: usize) { + debug_assert!(radix < 256); + unsafe { + *self.0.get_unchecked_mut(radix) += count; + } + } + + #[inline(always)] + pub fn new() -> Self { + Self::default() + } + + #[inline] + pub fn inner(&self) -> &[usize; 256] { + &self.0 + } +} + +impl Iterator for CountIter<'_> { + type Item = usize; + + #[inline(always)] + fn next(&mut self) -> Option { + if self.1 == 256 { + return None; + } + + let out = self.0[self.1]; + self.1 += 1; + + Some(out) + } + + #[inline(always)] + fn size_hint(&self) -> (usize, Option) { + (256 - self.1, Some(256 - self.1)) + } +} + +impl ExactSizeIterator for CountIter<'_> { + #[inline(always)] + fn len(&self) -> usize { + 256 - self.1 + } +} + +impl IntoIterator for Counts { + type Item = usize; + type IntoIter = core::array::IntoIter; + + #[inline(always)] + fn into_iter(self) -> Self::IntoIter { + self.0.into_iter() + } +} + +impl<'a> IntoIterator for &'a Counts { + type Item = &'a usize; + type IntoIter = Iter<'a, usize>; + + #[inline(always)] + fn into_iter(self) -> Self::IntoIter { + self.0.iter() + } +} diff --git a/src/lib.rs b/src/lib.rs index bd78e74..3c97a1c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -138,6 +138,7 @@ //! ``` //! use rdst::RadixSort; //! use rdst::tuner::{Algorithm, Tuner, TuningParams}; +//! use rdst::counts::Counts; //! //! struct MyTuner; //! @@ -171,6 +172,8 @@ //! //! Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions. +extern crate core; + mod radix_key; mod radix_key_impl; mod radix_sort_builder; @@ -190,6 +193,7 @@ mod sorter; mod tuners; // Public modules +pub mod counts; pub mod tuner; // Public exports diff --git a/src/radix_sort_builder.rs b/src/radix_sort_builder.rs index feea3cd..f4f1205 100644 --- a/src/radix_sort_builder.rs +++ b/src/radix_sort_builder.rs @@ -104,6 +104,7 @@ where /// ``` /// use rdst::RadixSort; /// use rdst::tuner::{Algorithm, Tuner, TuningParams}; + /// use rdst::counts::Counts; /// /// struct MyTuner; /// diff --git a/src/sorter.rs b/src/sorter.rs index 5ee7140..78671be 100644 --- a/src/sorter.rs +++ b/src/sorter.rs @@ -1,3 +1,4 @@ +use crate::counts::{CountManager, Counts}; use crate::tuner::{Algorithm, Tuner, TuningParams}; use crate::utils::*; use crate::RadixKey; @@ -6,11 +7,14 @@ use arbitrary_chunks::ArbitraryChunks; use rayon::current_num_threads; #[cfg(feature = "multi-threaded")] use rayon::prelude::*; +use std::cell::RefCell; use std::cmp::max; +use std::rc::Rc; pub struct Sorter<'a> { multi_threaded: bool, pub(crate) tuner: &'a (dyn Tuner + Send + Sync), + pub(crate) cm: CountManager, } impl<'a> Sorter<'a> { @@ -18,6 +22,7 @@ impl<'a> Sorter<'a> { Self { multi_threaded, tuner, + cm: CountManager::default(), } } @@ -26,47 +31,59 @@ impl<'a> Sorter<'a> { &self, level: usize, bucket: &mut [T], - counts: &[usize; 256], - tile_counts: Option>, + counts: Rc>, + tile_counts: Option>, #[allow(unused)] tile_size: usize, algorithm: Algorithm, ) where - T: RadixKey + Copy + Sized + Send + Sync, + T: RadixKey + Copy + Sized + Send + Sync + 'a, { - #[allow(unused)] - if let Some(tile_counts) = tile_counts { - match algorithm { - #[cfg(feature = "multi-threaded")] - Algorithm::Scanning => self.scanning_sort_adapter(bucket, counts, level), - #[cfg(feature = "multi-threaded")] - Algorithm::Recombinating => { - self.recombinating_sort_adapter(bucket, counts, &tile_counts, tile_size, level) - } - Algorithm::LrLsb => self.lsb_sort_adapter(true, bucket, counts, 0, level), - Algorithm::Lsb => self.lsb_sort_adapter(false, bucket, counts, 0, level), - Algorithm::Ska => self.ska_sort_adapter(bucket, counts, level), - Algorithm::Comparative => self.comparative_sort(bucket, level), - #[cfg(feature = "multi-threaded")] - Algorithm::Regions => { - self.regions_sort_adapter(bucket, counts, &tile_counts, tile_size, level) + if cfg!(feature = "multi-threaded") { + if let Some(tc) = tile_counts { + match algorithm { + Algorithm::MtOop => { + self.mt_oop_sort_adapter(bucket, level, counts, tc, tile_size) + } + Algorithm::Recombinating => { + self.recombinating_sort_adapter(bucket, counts, tc, tile_size, level) + } + Algorithm::Regions => { + self.regions_sort_adapter(bucket, counts, tc, tile_size, level) + } + _ => match algorithm { + Algorithm::MtLsb => self.mt_lsb_sort_adapter(bucket, 0, level, tile_size), + Algorithm::Scanning => self.scanning_sort_adapter(bucket, counts, level), + Algorithm::Comparative => self.comparative_sort(bucket, level), + Algorithm::LrLsb => self.lsb_sort_adapter(true, bucket, counts, 0, level), + Algorithm::Lsb => self.lsb_sort_adapter(false, bucket, counts, 0, level), + Algorithm::Ska => self.ska_sort_adapter(bucket, counts, level), + _ => panic!( + "Bad algorithm: {:?} with unused tc for len: {}", + algorithm, + bucket.len() + ), + }, } - #[cfg(feature = "multi-threaded")] - Algorithm::MtOop => { - self.mt_oop_sort_adapter(bucket, level, counts, &tile_counts, tile_size) + } else { + match algorithm { + Algorithm::MtLsb => self.mt_lsb_sort_adapter(bucket, 0, level, tile_size), + Algorithm::Comparative => self.comparative_sort(bucket, level), + Algorithm::LrLsb => self.lsb_sort_adapter(true, bucket, counts, 0, level), + Algorithm::Lsb => self.lsb_sort_adapter(false, bucket, counts, 0, level), + Algorithm::Ska => self.ska_sort_adapter(bucket, counts, level), + Algorithm::Scanning => self.scanning_sort_adapter(bucket, counts, level), + _ => panic!("Bad algorithm: {:?} for len: {}", algorithm, bucket.len()), } - #[cfg(feature = "multi-threaded")] - Algorithm::MtLsb => self.mt_lsb_sort_adapter(bucket, 0, level, tile_size), } } else { match algorithm { - #[cfg(feature = "multi-threaded")] - Algorithm::Scanning => self.scanning_sort_adapter(bucket, counts, level), Algorithm::LrLsb => self.lsb_sort_adapter(true, bucket, counts, 0, level), Algorithm::Lsb => self.lsb_sort_adapter(false, bucket, counts, 0, level), Algorithm::Ska => self.ska_sort_adapter(bucket, counts, level), Algorithm::Comparative => self.comparative_sort(bucket, level), - #[cfg(feature = "multi-threaded")] - e => panic!("Bad algorithm: {:?} for len: {}", e, bucket.len()), + // XXX: The compiler currently doesn't recognize that the other options are not available due to the + // missing feature flag, so we need to add a catch-all here. + _ => panic!("Bad algorithm: {:?} for len: {}", algorithm, bucket.len()), } } } @@ -78,7 +95,7 @@ impl<'a> Sorter<'a> { parent_len: Option, threads: usize, ) where - T: RadixKey + Sized + Send + Copy + Sync, + T: RadixKey + Sized + Send + Copy + Sync + 'a, { if chunk.len() <= 1 { return; @@ -102,33 +119,33 @@ impl<'a> Sorter<'a> { parent_len, }; - let mut tile_counts: Option> = None; + let mut tile_counts: Option> = None; let mut already_sorted = false; if use_tiles { - let (tc, s) = get_tile_counts(chunk, tile_size, level); + let (tc, s) = get_tile_counts(&self.cm, chunk, tile_size, level); tile_counts = Some(tc); already_sorted = s; } let counts = if let Some(tile_counts) = &tile_counts { - aggregate_tile_counts(tile_counts) + aggregate_tile_counts(&self.cm, tile_counts) } else { - let (counts, s) = get_counts(chunk, level); - already_sorted = s; + let (rc, ra) = self.cm.counts(chunk, level); + already_sorted = ra; - counts + rc }; - if already_sorted || (chunk.len() >= 30_000 && is_homogenous_bucket(&counts)) { + if already_sorted || (chunk.len() >= 30_000 && is_homogenous(&counts.borrow())) { if level != 0 { - self.director(chunk, &counts, level - 1); + self.director(chunk, counts, level - 1); } return; } - let algorithm = self.tuner.pick_algorithm(&tp, &counts); + let algorithm = self.tuner.pick_algorithm(&tp, counts.borrow().inner()); // Ensure tile_counts is always set when it is required if tile_counts.is_none() { @@ -137,7 +154,7 @@ impl<'a> Sorter<'a> { Algorithm::MtOop | Algorithm::MtLsb | Algorithm::Recombinating - | Algorithm::Regions => Some(vec![counts]), + | Algorithm::Regions => Some(vec![counts.borrow().clone()]), _ => None, }; } @@ -145,13 +162,13 @@ impl<'a> Sorter<'a> { #[cfg(feature = "work_profiles")] println!("({}) PAR: {:?}", level, algorithm); - self.run_sort(level, chunk, &counts, tile_counts, tile_size, algorithm); + self.run_sort(level, chunk, counts, tile_counts, tile_size, algorithm); } #[inline] pub fn top_level_director(&self, bucket: &mut [T]) where - T: RadixKey + Sized + Send + Copy + Sync, + T: RadixKey + Sized + Send + Copy + Sync + 'a, { #[cfg(feature = "multi-threaded")] let threads = current_num_threads(); @@ -166,36 +183,91 @@ impl<'a> Sorter<'a> { #[inline] #[cfg(feature = "multi-threaded")] - pub fn multi_threaded_director(&self, bucket: &mut [T], counts: &[usize; 256], level: usize) - where - T: RadixKey + Send + Copy + Sync, + pub fn multi_threaded_director( + &self, + bucket: &'a mut [T], + counts: Rc>, + level: usize, + ) where + T: 'a + RadixKey + Send + Copy + Sync, { let parent_len = Some(bucket.len()); let threads = current_num_threads(); - bucket - .arbitrary_chunks_mut(counts) - .par_bridge() - .for_each(|chunk| self.handle_chunk(chunk, level, parent_len, threads)); + let segment_size = cdiv(bucket.len(), threads); + + let mut running_total = 0; + let mut radix_start = 255; + let mut radix_end = 255; + let mut finished = false; + + let cbb = counts.borrow(); + let cb = cbb.inner(); + + let mut bucket: &'a mut [T] = bucket; + let mut jobs: Vec<(&'a mut [T], &[usize])> = Vec::with_capacity(threads); + + 'outer: for _ in 0..threads { + loop { + running_total += cb[radix_start]; + + if finished { + break 'outer; + } else if radix_start == 0 { + let b: &'a mut [T] = std::mem::take(&mut bucket); + finished = true; + jobs.push((b, &cb[radix_start..=radix_end])); + continue 'outer; + } else if running_total >= segment_size { + let b: &'a mut [T] = std::mem::take(&mut bucket); + let (rest, seg) = b.split_at_mut(b.len() - running_total); + bucket = rest; + let ret = (seg, &cb[radix_start..=radix_end]); + + radix_start -= 1; + radix_end = radix_start; + running_total = 0; + + jobs.push(ret); + continue 'outer; + } else { + radix_start -= 1; + } + } + } + + jobs.into_par_iter().for_each(|(seg, c)| { + seg.arbitrary_chunks_mut(c) + .for_each(|chunk| self.handle_chunk(chunk, level, parent_len, threads)); + }); + + drop(cbb); + self.cm.return_counts(counts); } #[inline] - pub fn single_threaded_director(&self, bucket: &mut [T], counts: &[usize; 256], level: usize) - where - T: RadixKey + Send + Sync + Copy, + pub fn single_threaded_director( + &self, + bucket: &mut [T], + counts: Rc>, + level: usize, + ) where + T: RadixKey + Send + Sync + Copy + 'a, { let parent_len = Some(bucket.len()); let threads = 1; bucket - .arbitrary_chunks_mut(counts) + .arbitrary_chunks_mut(counts.borrow().inner()) .for_each(|chunk| self.handle_chunk(chunk, level, parent_len, threads)); + + self.cm.return_counts(counts); } #[inline] - pub fn director(&self, bucket: &mut [T], counts: &[usize; 256], level: usize) + pub fn director(&self, bucket: &mut [T], counts: Rc>, level: usize) where - T: RadixKey + Send + Sync + Copy, + T: RadixKey + Send + Sync + Copy + 'a, { if cfg!(feature = "multi-threaded") && self.multi_threaded { #[cfg(feature = "multi-threaded")] diff --git a/src/sorts/comparative_sort.rs b/src/sorts/comparative_sort.rs index 1ab180c..5bc2641 100644 --- a/src/sorts/comparative_sort.rs +++ b/src/sorts/comparative_sort.rs @@ -28,7 +28,7 @@ use std::cmp::Ordering; impl<'a> Sorter<'a> { pub(crate) fn comparative_sort(&self, bucket: &mut [T], start_level: usize) where - T: RadixKey + Sized + Send + Copy + Sync, + T: RadixKey + Sized + Send + Copy + Sync + 'a, { if bucket.len() < 2 { return; @@ -64,9 +64,8 @@ mod tests { where T: NumericTest, { - let sorter = Sorter::new(true, &StandardTuner); - sort_comparison_suite(shift, |inputs| { + let sorter = Sorter::new(true, &StandardTuner); sorter.comparative_sort(inputs, T::LEVELS - 1); }); } @@ -108,9 +107,8 @@ mod tests { #[test] pub fn test_u32_patterns() { - let sorter = Sorter::new(true, &StandardTuner); - validate_u32_patterns(|inputs| { + let sorter = Sorter::new(true, &StandardTuner); sorter.comparative_sort(inputs, u32::LEVELS - 1); }); } diff --git a/src/sorts/lsb_sort.rs b/src/sorts/lsb_sort.rs index 367e986..1804a7d 100644 --- a/src/sorts/lsb_sort.rs +++ b/src/sorts/lsb_sort.rs @@ -33,7 +33,10 @@ use crate::sorts::out_of_place_sort::{ lr_out_of_place_sort, lr_out_of_place_sort_with_counts, out_of_place_sort, out_of_place_sort_with_counts, }; -use crate::utils::*; +use std::cell::RefCell; +use std::rc::Rc; + +use crate::counts::{CountMeta, Counts}; use crate::RadixKey; impl<'a> Sorter<'a> { @@ -41,97 +44,145 @@ impl<'a> Sorter<'a> { &self, lr: bool, bucket: &mut [T], - last_counts: &[usize; 256], + last_counts: Rc>, start_level: usize, end_level: usize, ) where - T: RadixKey + Sized + Send + Copy + Sync, + T: RadixKey + Sized + Send + Copy + Sync + 'a, { if bucket.len() < 2 { return; } - let mut tmp_bucket = get_tmp_bucket(bucket.len()); - let levels: Vec = (start_level..=end_level).collect(); - let mut invert = false; - let mut next_counts = None; - - 'outer: for level in levels { - let counts = if level == end_level { - *last_counts - } else if let Some(next_counts) = next_counts { - next_counts - } else { - let (counts, already_sorted) = get_counts(bucket, level); - if already_sorted { - next_counts = None; - continue 'outer; - } - - counts - }; - - for c in counts.iter() { - if *c == bucket.len() { - next_counts = None; - continue 'outer; - } else if *c > 0 { - break; - } - } + self.cm.with_tmp_buffer(bucket.len(), |cm, tmp_bucket| { + let mut invert = false; + let mut use_next_counts = false; + let mut counts = cm.get_empty_counts(); + let mut meta = CountMeta::default(); + let mut next_counts: Rc> = cm.get_empty_counts(); + + for level in start_level..=end_level { + if level == end_level { + cm.return_counts(counts); + counts = last_counts.clone(); + } else if use_next_counts { + counts.borrow_mut().clear(); + (counts, next_counts) = (next_counts, counts); + } else { + let mut c_mut = counts.borrow_mut(); + c_mut.clear(); + cm.count_into(&mut c_mut, &mut meta, bucket, level); + drop(c_mut); + next_counts.borrow_mut().clear(); + + if meta.already_sorted { + use_next_counts = false; + continue; + } + }; + + let counts = counts.borrow(); + let sums_rc = cm.prefix_sums(&counts); + let mut sums = sums_rc.borrow_mut(); + let should_count = end_level != 0 && level < (end_level - 1); + use_next_counts = should_count; + + match (lr, invert, should_count) { + (true, true, true) => { + let ends = cm.end_offsets(&counts, &sums); + let scratch_counts = cm.get_empty_counts(); + lr_out_of_place_sort_with_counts( + tmp_bucket, + bucket, + level, + &mut sums, + &mut ends.borrow_mut(), + &mut next_counts.borrow_mut(), + &mut scratch_counts.borrow_mut(), + ); + cm.return_counts(ends); + cm.return_counts(scratch_counts); + } + (true, true, false) => { + let ends = cm.end_offsets(&counts, &sums); + lr_out_of_place_sort( + tmp_bucket, + bucket, + level, + &mut sums, + &mut ends.borrow_mut(), + ); + cm.return_counts(ends); + } + (true, false, true) => { + let ends = cm.end_offsets(&counts, &sums); + let scratch_counts = cm.get_empty_counts(); + lr_out_of_place_sort_with_counts( + bucket, + tmp_bucket, + level, + &mut sums, + &mut ends.borrow_mut(), + &mut next_counts.borrow_mut(), + &mut scratch_counts.borrow_mut(), + ); + cm.return_counts(ends); + cm.return_counts(scratch_counts); + } + (true, false, false) => { + let ends = cm.end_offsets(&counts, &sums); + lr_out_of_place_sort( + bucket, + tmp_bucket, + level, + &mut sums, + &mut ends.borrow_mut(), + ); + cm.return_counts(ends); + } + (false, true, true) => { + let scratch_counts = cm.get_empty_counts(); + out_of_place_sort_with_counts( + tmp_bucket, + bucket, + level, + &mut sums, + &mut next_counts.borrow_mut(), + &mut scratch_counts.borrow_mut(), + ); + cm.return_counts(scratch_counts); + } + (false, true, false) => out_of_place_sort(tmp_bucket, bucket, level, &mut sums), + (false, false, true) => { + let scratch_counts = cm.get_empty_counts(); + out_of_place_sort_with_counts( + bucket, + tmp_bucket, + level, + &mut sums, + &mut next_counts.borrow_mut(), + &mut scratch_counts.borrow_mut(), + ); + cm.return_counts(scratch_counts); + } + (false, false, false) => { + out_of_place_sort(bucket, tmp_bucket, level, &mut sums) + } + }; + + drop(sums); + cm.return_counts(sums_rc); - let should_count = end_level != 0 && level < (end_level - 1); - if !should_count { - next_counts = None; + invert = !invert; } - match (lr, invert, should_count) { - (true, true, true) => { - next_counts = Some(lr_out_of_place_sort_with_counts( - &tmp_bucket, - bucket, - &counts, - level, - )) - } - (true, true, false) => lr_out_of_place_sort(&tmp_bucket, bucket, &counts, level), - (true, false, true) => { - next_counts = Some(lr_out_of_place_sort_with_counts( - bucket, - &mut tmp_bucket, - &counts, - level, - )) - } - (true, false, false) => { - lr_out_of_place_sort(bucket, &mut tmp_bucket, &counts, level) - } - (false, true, true) => { - next_counts = Some(out_of_place_sort_with_counts( - &tmp_bucket, - bucket, - &counts, - level, - )) - } - (false, true, false) => out_of_place_sort(&tmp_bucket, bucket, &counts, level), - (false, false, true) => { - next_counts = Some(out_of_place_sort_with_counts( - bucket, - &mut tmp_bucket, - &counts, - level, - )) - } - (false, false, false) => out_of_place_sort(bucket, &mut tmp_bucket, &counts, level), - }; - - invert = !invert; - } + cm.return_counts(counts); + cm.return_counts(next_counts); - if invert { - bucket.copy_from_slice(&tmp_bucket); - } + if invert { + bucket.copy_from_slice(tmp_bucket); + } + }); } } @@ -140,7 +191,6 @@ mod tests { use crate::sorter::Sorter; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; - use crate::utils::get_counts; use crate::utils::test_utils::{ sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, }; @@ -150,18 +200,18 @@ mod tests { where T: NumericTest, { - let sorter = Sorter::new(true, &StandardTuner); - sort_comparison_suite(shift, |inputs| { - let (counts, _) = get_counts(inputs, T::LEVELS - 1); + let sorter = Sorter::new(true, &StandardTuner); + let (counts, _) = sorter.cm.counts(inputs, T::LEVELS - 1); - sorter.lsb_sort_adapter(false, inputs, &counts, 0, T::LEVELS - 1) + sorter.lsb_sort_adapter(false, inputs, counts, 0, T::LEVELS - 1) }); sort_comparison_suite(shift, |inputs| { - let (counts, _) = get_counts(inputs, T::LEVELS - 1); + let sorter = Sorter::new(true, &StandardTuner); + let (counts, _) = sorter.cm.counts(inputs, T::LEVELS - 1); - sorter.lsb_sort_adapter(true, inputs, &counts, 0, T::LEVELS - 1); + sorter.lsb_sort_adapter(true, inputs, counts, 0, T::LEVELS - 1); }); } @@ -209,9 +259,9 @@ mod tests { pub fn test_u32_patterns() { validate_u32_patterns(|inputs| { let sorter = Sorter::new(true, &StandardTuner); - let (counts, _) = get_counts(inputs, u32::LEVELS - 1); + let (counts, _) = sorter.cm.counts(inputs, u32::LEVELS - 1); - sorter.lsb_sort_adapter(true, inputs, &counts, 0, u32::LEVELS - 1); + sorter.lsb_sort_adapter(true, inputs, counts, 0, u32::LEVELS - 1); }); } } diff --git a/src/sorts/mod.rs b/src/sorts/mod.rs index 4d13a5b..795bde1 100644 --- a/src/sorts/mod.rs +++ b/src/sorts/mod.rs @@ -10,16 +10,3 @@ mod regions_sort; #[cfg(feature = "multi-threaded")] mod scanning_sort; mod ska_sort; - -pub use comparative_sort::*; -pub use lsb_sort::*; -#[cfg(feature = "multi-threaded")] -pub use mt_lsb_sort::*; -pub use out_of_place_sort::*; -#[cfg(feature = "multi-threaded")] -pub use recombinating_sort::*; -#[cfg(feature = "multi-threaded")] -pub use regions_sort::*; -#[cfg(feature = "multi-threaded")] -pub use scanning_sort::*; -pub use ska_sort::*; diff --git a/src/sorts/mt_lsb_sort.rs b/src/sorts/mt_lsb_sort.rs index e84b8a5..fc5f533 100644 --- a/src/sorts/mt_lsb_sort.rs +++ b/src/sorts/mt_lsb_sort.rs @@ -28,16 +28,19 @@ //! //! This variant uses the same algorithm as `mt_lsb_sort` but uses it in msb-first order. +use crate::counts::Counts; use crate::sorter::Sorter; use crate::utils::*; use crate::RadixKey; use arbitrary_chunks::ArbitraryChunks; use rayon::prelude::*; +use std::cell::RefCell; +use std::rc::Rc; pub fn mt_lsb_sort( src_bucket: &mut [T], dst_bucket: &mut [T], - tile_counts: &[[usize; 256]], + tile_counts: &[Counts], tile_size: usize, level: usize, ) where @@ -142,71 +145,71 @@ impl<'a> Sorter<'a> { end_level: usize, tile_size: usize, ) where - T: RadixKey + Sized + Send + Copy + Sync, + T: RadixKey + Sized + Send + Copy + Sync + 'a, { if bucket.len() < 2 { return; } - let mut tmp_bucket = get_tmp_bucket(bucket.len()); - let levels: Vec = (start_level..=end_level).collect(); - let mut invert = false; + self.cm.with_tmp_buffer(bucket.len(), |_, tmp_bucket| { + let levels: Vec = (start_level..=end_level).collect(); + let mut invert = false; - for level in levels { - let (tile_counts, already_sorted) = if invert { - get_tile_counts(&tmp_bucket, tile_size, level) - } else { - get_tile_counts(bucket, tile_size, level) - }; + for level in levels { + let (tile_counts, already_sorted) = if invert { + get_tile_counts(&self.cm, tmp_bucket, tile_size, level) + } else { + get_tile_counts(&self.cm, bucket, tile_size, level) + }; - if already_sorted { - continue; - } + if already_sorted { + continue; + } - if invert { - mt_lsb_sort(&mut tmp_bucket, bucket, &tile_counts, tile_size, level) - } else { - mt_lsb_sort(bucket, &mut tmp_bucket, &tile_counts, tile_size, level) - }; + if invert { + mt_lsb_sort(tmp_bucket, bucket, &tile_counts, tile_size, level) + } else { + mt_lsb_sort(bucket, tmp_bucket, &tile_counts, tile_size, level) + }; - invert = !invert; - } + invert = !invert; + } - if invert { - bucket - .par_chunks_mut(tile_size) - .zip(tmp_bucket.par_chunks(tile_size)) - .for_each(|(chunk, tmp_chunk)| { - chunk.copy_from_slice(tmp_chunk); - }); - } + if invert { + bucket + .par_chunks_mut(tile_size) + .zip(tmp_bucket.par_chunks(tile_size)) + .for_each(|(chunk, tmp_chunk)| { + chunk.copy_from_slice(tmp_chunk); + }); + } + }); } pub(crate) fn mt_oop_sort_adapter( &self, bucket: &mut [T], level: usize, - counts: &[usize; 256], - tile_counts: &[[usize; 256]], + counts: Rc>, + tile_counts: Vec, tile_size: usize, ) where - T: RadixKey + Sized + Send + Copy + Sync, + T: RadixKey + Sized + Send + Copy + Sync + 'a, { if bucket.len() <= 1 { return; } - let mut tmp_bucket = get_tmp_bucket(bucket.len()); - mt_lsb_sort(bucket, &mut tmp_bucket, tile_counts, tile_size, level); - - bucket - .par_chunks_mut(tile_size) - .zip(tmp_bucket.par_chunks(tile_size)) - .for_each(|(chunk, tmp_chunk)| { - chunk.copy_from_slice(tmp_chunk); - }); + self.cm.with_tmp_buffer(bucket.len(), |_, tmp_bucket| { + mt_lsb_sort(bucket, tmp_bucket, &tile_counts, tile_size, level); - drop(tmp_bucket); + bucket + .par_chunks_mut(tile_size) + .zip(tmp_bucket.par_chunks(tile_size)) + .for_each(|(chunk, tmp_chunk)| { + chunk.copy_from_slice(tmp_chunk); + }); + }); self.director(bucket, counts, level - 1); } @@ -228,13 +231,12 @@ mod tests { where T: NumericTest, { - let sorter = Sorter::new(true, &StandardTuner); - sort_comparison_suite(shift, |inputs| { if inputs.len() == 0 { return; } + let sorter = Sorter::new(true, &StandardTuner); let tile_size = cdiv(inputs.len(), current_num_threads()); sorter.mt_lsb_sort_adapter(inputs, 0, T::LEVELS - 1, tile_size); diff --git a/src/sorts/out_of_place_sort.rs b/src/sorts/out_of_place_sort.rs index cf305e3..b25f698 100644 --- a/src/sorts/out_of_place_sort.rs +++ b/src/sorts/out_of_place_sort.rs @@ -43,15 +43,15 @@ //! * single-threaded //! * lsb-first -use crate::utils::*; +use crate::counts::{Counts, EndOffsets, PrefixSums}; use crate::RadixKey; #[inline] pub fn out_of_place_sort( src_bucket: &[T], dst_bucket: &mut [T], - counts: &[usize; 256], level: usize, + prefix_sums: &mut PrefixSums, ) where T: RadixKey + Sized + Send + Copy + Sync, { @@ -60,8 +60,6 @@ pub fn out_of_place_sort( return; } - let mut prefix_sums = get_prefix_sums(counts); - let chunks = src_bucket.chunks_exact(8); let rem = chunks.remainder(); @@ -104,25 +102,22 @@ pub fn out_of_place_sort( pub fn out_of_place_sort_with_counts( src_bucket: &[T], dst_bucket: &mut [T], - counts: &[usize; 256], level: usize, -) -> [usize; 256] -where + prefix_sums: &mut PrefixSums, + next_counts: &mut Counts, + scratch_counts: &mut Counts, +) where T: RadixKey + Sized + Send + Copy + Sync, { if src_bucket.is_empty() { - return [0usize; 256]; + return; } else if src_bucket.len() == 1 { - let mut counts = [0usize; 256]; dst_bucket.copy_from_slice(src_bucket); - counts[src_bucket[0].get_level(level) as usize] = 1; - return counts; + next_counts[src_bucket[0].get_level(level) as usize] = 1; + return; } let next_level = level + 1; - let mut prefix_sums = get_prefix_sums(counts); - let mut next_counts_0 = [0usize; 256]; - let mut next_counts_1 = [0usize; 256]; let chunks = src_bucket.chunks_exact(8); let rem = chunks.remainder(); @@ -147,28 +142,28 @@ where dst_bucket[prefix_sums[b0]] = chunk[0]; prefix_sums[b0] += 1; - next_counts_0[bn0] += 1; + next_counts[bn0] += 1; dst_bucket[prefix_sums[b1]] = chunk[1]; prefix_sums[b1] += 1; - next_counts_1[bn1] += 1; + scratch_counts[bn1] += 1; dst_bucket[prefix_sums[b2]] = chunk[2]; prefix_sums[b2] += 1; - next_counts_0[bn2] += 1; + next_counts[bn2] += 1; dst_bucket[prefix_sums[b3]] = chunk[3]; prefix_sums[b3] += 1; - next_counts_1[bn3] += 1; + scratch_counts[bn3] += 1; dst_bucket[prefix_sums[b4]] = chunk[4]; prefix_sums[b4] += 1; - next_counts_0[bn4] += 1; + next_counts[bn4] += 1; dst_bucket[prefix_sums[b5]] = chunk[5]; prefix_sums[b5] += 1; - next_counts_1[bn5] += 1; + scratch_counts[bn5] += 1; dst_bucket[prefix_sums[b6]] = chunk[6]; prefix_sums[b6] += 1; - next_counts_0[bn6] += 1; + next_counts[bn6] += 1; dst_bucket[prefix_sums[b7]] = chunk[7]; prefix_sums[b7] += 1; - next_counts_1[bn7] += 1; + scratch_counts[bn7] += 1; }); rem.iter().for_each(|val| { @@ -176,22 +171,21 @@ where let bn = val.get_level(next_level) as usize; dst_bucket[prefix_sums[b]] = *val; prefix_sums[b] += 1; - next_counts_0[bn] += 1; + next_counts[bn] += 1; }); for i in 0..256 { - next_counts_0[i] += next_counts_1[i]; + next_counts[i] += scratch_counts[i]; } - - next_counts_0 } #[inline] pub fn lr_out_of_place_sort( src_bucket: &[T], dst_bucket: &mut [T], - counts: &[usize; 256], level: usize, + prefix_sums: &mut PrefixSums, + ends: &mut EndOffsets, ) where T: RadixKey + Sized + Send + Copy + Sync, { @@ -200,13 +194,6 @@ pub fn lr_out_of_place_sort( return; } - let mut offsets = get_prefix_sums(counts); - let mut ends = [0usize; 256]; - - for (i, b) in offsets.iter().enumerate() { - ends[i] = b + counts[i].saturating_sub(1); - } - let mut left = 0; let mut right = src_bucket.len() - 1; let pre = src_bucket.len() % 8; @@ -214,8 +201,8 @@ pub fn lr_out_of_place_sort( for _ in 0..pre { let b = src_bucket[right].get_level(level) as usize; - dst_bucket[ends[b]] = src_bucket[right]; ends[b] = ends[b].saturating_sub(1); + dst_bucket[ends[b]] = src_bucket[right]; right = right.saturating_sub(1); } @@ -235,22 +222,22 @@ pub fn lr_out_of_place_sort( let br_2 = src_bucket[right - 2].get_level(level) as usize; let br_3 = src_bucket[right - 3].get_level(level) as usize; - dst_bucket[offsets[bl_0]] = src_bucket[left]; - offsets[bl_0] = offsets[bl_0].wrapping_add(1); + dst_bucket[prefix_sums[bl_0]] = src_bucket[left]; + prefix_sums[bl_0] = prefix_sums[bl_0].wrapping_add(1); + ends[br_0] = ends[br_0].saturating_sub(1); dst_bucket[ends[br_0]] = src_bucket[right]; - ends[br_0] = ends[br_0].wrapping_sub(1); - dst_bucket[offsets[bl_1]] = src_bucket[left + 1]; - offsets[bl_1] = offsets[bl_1].wrapping_add(1); + dst_bucket[prefix_sums[bl_1]] = src_bucket[left + 1]; + prefix_sums[bl_1] = prefix_sums[bl_1].wrapping_add(1); + ends[br_1] = ends[br_1].saturating_sub(1); dst_bucket[ends[br_1]] = src_bucket[right - 1]; - ends[br_1] = ends[br_1].wrapping_sub(1); - dst_bucket[offsets[bl_2]] = src_bucket[left + 2]; - offsets[bl_2] = offsets[bl_2].wrapping_add(1); + dst_bucket[prefix_sums[bl_2]] = src_bucket[left + 2]; + prefix_sums[bl_2] = prefix_sums[bl_2].wrapping_add(1); + ends[br_2] = ends[br_2].saturating_sub(1); dst_bucket[ends[br_2]] = src_bucket[right - 2]; - ends[br_2] = ends[br_2].wrapping_sub(1); - dst_bucket[offsets[bl_3]] = src_bucket[left + 3]; - offsets[bl_3] = offsets[bl_3].wrapping_add(1); + dst_bucket[prefix_sums[bl_3]] = src_bucket[left + 3]; + prefix_sums[bl_3] = prefix_sums[bl_3].wrapping_add(1); + ends[br_3] = ends[br_3].saturating_sub(1); dst_bucket[ends[br_3]] = src_bucket[right - 3]; - ends[br_3] = ends[br_3].wrapping_sub(1); left += 4; right -= 4; @@ -261,32 +248,23 @@ pub fn lr_out_of_place_sort( pub fn lr_out_of_place_sort_with_counts( src_bucket: &[T], dst_bucket: &mut [T], - counts: &[usize; 256], level: usize, -) -> [usize; 256] -where + prefix_sums: &mut PrefixSums, + ends: &mut EndOffsets, + next_counts: &mut Counts, + counts_scratch: &mut Counts, +) where T: RadixKey + Sized + Send + Copy + Sync, { if src_bucket.is_empty() { - return [0usize; 256]; + return; } else if src_bucket.len() == 1 { - let mut counts = [0usize; 256]; dst_bucket.copy_from_slice(src_bucket); - counts[src_bucket[0].get_level(level) as usize] = 1; - return counts; + next_counts[src_bucket[0].get_level(level) as usize] = 1; + return; } let next_level = level + 1; - let mut next_counts_0 = [0usize; 256]; - let mut next_counts_1 = [0usize; 256]; - - let mut offsets = get_prefix_sums(counts); - let mut ends = [0usize; 256]; - - for (i, b) in offsets.iter().enumerate() { - ends[i] = b + counts[i].saturating_sub(1); - } - let mut left = 0; let mut right = src_bucket.len() - 1; let pre = src_bucket.len() % 8; @@ -295,14 +273,14 @@ where let b = src_bucket[right].get_level(level) as usize; let bn = src_bucket[right].get_level(next_level) as usize; + ends[b] = ends[b].saturating_sub(1); dst_bucket[ends[b]] = src_bucket[right]; - ends[b] = ends[b].wrapping_sub(1); - right = right.wrapping_sub(1); - next_counts_0[bn] += 1; + right = right.saturating_sub(1); + next_counts[bn] += 1; } if pre == src_bucket.len() { - return next_counts_0; + return; } let end = (src_bucket.len() - pre) / 2; @@ -317,25 +295,25 @@ where let br_2 = src_bucket[right - 2].get_level(level) as usize; let br_3 = src_bucket[right - 3].get_level(level) as usize; - dst_bucket[offsets[bl_0]] = src_bucket[left]; + dst_bucket[prefix_sums[bl_0]] = src_bucket[left]; + ends[br_0] = ends[br_0].saturating_sub(1); dst_bucket[ends[br_0]] = src_bucket[right]; - ends[br_0] = ends[br_0].wrapping_sub(1); - offsets[bl_0] = offsets[bl_0].wrapping_add(1); + prefix_sums[bl_0] = prefix_sums[bl_0].wrapping_add(1); - dst_bucket[offsets[bl_1]] = src_bucket[left + 1]; + dst_bucket[prefix_sums[bl_1]] = src_bucket[left + 1]; + ends[br_1] = ends[br_1].saturating_sub(1); dst_bucket[ends[br_1]] = src_bucket[right - 1]; - ends[br_1] = ends[br_1].wrapping_sub(1); - offsets[bl_1] = offsets[bl_1].wrapping_add(1); + prefix_sums[bl_1] = prefix_sums[bl_1].wrapping_add(1); - dst_bucket[offsets[bl_2]] = src_bucket[left + 2]; + dst_bucket[prefix_sums[bl_2]] = src_bucket[left + 2]; + ends[br_2] = ends[br_2].saturating_sub(1); dst_bucket[ends[br_2]] = src_bucket[right - 2]; - ends[br_2] = ends[br_2].wrapping_sub(1); - offsets[bl_2] = offsets[bl_2].wrapping_add(1); + prefix_sums[bl_2] = prefix_sums[bl_2].wrapping_add(1); - dst_bucket[offsets[bl_3]] = src_bucket[left + 3]; + dst_bucket[prefix_sums[bl_3]] = src_bucket[left + 3]; + ends[br_3] = ends[br_3].saturating_sub(1); dst_bucket[ends[br_3]] = src_bucket[right - 3]; - ends[br_3] = ends[br_3].wrapping_sub(1); - offsets[bl_3] = offsets[bl_3].wrapping_add(1); + prefix_sums[bl_3] = prefix_sums[bl_3].wrapping_add(1); let bnl_0 = src_bucket[left].get_level(next_level) as usize; let bnl_1 = src_bucket[left + 1].get_level(next_level) as usize; @@ -346,22 +324,20 @@ where let bnr_2 = src_bucket[right - 2].get_level(next_level) as usize; let bnr_3 = src_bucket[right - 3].get_level(next_level) as usize; - next_counts_0[bnl_0] += 1; - next_counts_1[bnr_0] += 1; - next_counts_0[bnl_1] += 1; - next_counts_1[bnr_1] += 1; - next_counts_0[bnl_2] += 1; - next_counts_1[bnr_2] += 1; - next_counts_0[bnl_3] += 1; - next_counts_1[bnr_3] += 1; + next_counts[bnl_0] += 1; + counts_scratch[bnr_0] += 1; + next_counts[bnl_1] += 1; + counts_scratch[bnr_1] += 1; + next_counts[bnl_2] += 1; + counts_scratch[bnr_2] += 1; + next_counts[bnl_3] += 1; + counts_scratch[bnr_3] += 1; left += 4; - right -= 4; + right = right.wrapping_sub(4); } for i in 0..256 { - next_counts_0[i] += next_counts_1[i]; + next_counts[i] += counts_scratch[i]; } - - next_counts_0 } diff --git a/src/sorts/recombinating_sort.rs b/src/sorts/recombinating_sort.rs index 19a97b1..04fd17d 100644 --- a/src/sorts/recombinating_sort.rs +++ b/src/sorts/recombinating_sort.rs @@ -22,76 +22,89 @@ //! constraints. As this is an out-of-place algorithm, you need 2n memory relative to the input for //! this sort, and eventually the extra allocation and freeing required eats away at the performance. +use crate::counts::{CountManager, Counts}; use crate::sorter::Sorter; use crate::sorts::out_of_place_sort::out_of_place_sort; -use crate::utils::*; use crate::RadixKey; use arbitrary_chunks::ArbitraryChunks; use rayon::prelude::*; +use std::cell::RefCell; +use std::rc::Rc; pub fn recombinating_sort( + cm: &CountManager, bucket: &mut [T], - counts: &[usize; 256], - tile_counts: &[[usize; 256]], + counts: &Counts, + tile_counts: Vec, tile_size: usize, level: usize, ) where T: RadixKey + Sized + Send + Copy + Sync, { let bucket_len = bucket.len(); - let mut tmp_bucket = get_tmp_bucket::(bucket_len); - - let locals: Vec<([usize; 256], [usize; 256])> = bucket - .par_chunks(tile_size) - .zip(tmp_bucket.par_chunks_mut(tile_size)) - .zip(tile_counts.into_par_iter()) - .map(|((chunk, tmp_chunk), counts)| { - out_of_place_sort(chunk, tmp_chunk, counts, level); - - let sums = get_prefix_sums(counts); - - (*counts, sums) - }) - .collect(); - - bucket - .arbitrary_chunks_mut(counts) - .enumerate() - .par_bridge() - .for_each(|(index, global_chunk)| { - let mut read_offset = 0; - let mut write_offset = 0; - - for (counts, sums) in locals.iter() { - let read_start = read_offset + sums[index]; - let read_end = read_start + counts[index]; - let read_slice = &tmp_bucket[read_start..read_end]; - let write_end = write_offset + read_slice.len(); - - global_chunk[write_offset..write_end].copy_from_slice(read_slice); - - read_offset += tile_size; - write_offset = write_end; - } - }); + + cm.with_tmp_buffer(bucket_len, |cm, tmp_bucket| { + bucket + .par_chunks(tile_size) + .zip(tmp_bucket.par_chunks_mut(tile_size)) + .zip(tile_counts.par_iter()) + .for_each(|((chunk, tmp_chunk), counts)| { + let sums = cm.prefix_sums(counts); + out_of_place_sort(chunk, tmp_chunk, level, &mut sums.borrow_mut()); + cm.return_counts(sums); + }); + + bucket + .arbitrary_chunks_mut(counts.inner()) + .enumerate() + .par_bridge() + .for_each(|(index, global_chunk)| { + let mut read_offset = 0; + let mut write_offset = 0; + + for tile_c in tile_counts.iter() { + let sum = if index == 0 { + 0 + } else { + tile_c.into_iter().take(index).sum::() + }; + let read_start = read_offset + sum; + let read_end = read_start + tile_c[index]; + let read_slice = &tmp_bucket[read_start..read_end]; + let write_end = write_offset + read_slice.len(); + + global_chunk[write_offset..write_end].copy_from_slice(read_slice); + + read_offset += tile_size; + write_offset = write_end; + } + }); + }); } impl<'a> Sorter<'a> { pub(crate) fn recombinating_sort_adapter( &self, bucket: &mut [T], - counts: &[usize; 256], - tile_counts: &[[usize; 256]], + counts: Rc>, + tile_counts: Vec, tile_size: usize, level: usize, ) where - T: RadixKey + Sized + Send + Copy + Sync, + T: RadixKey + Sized + Send + Copy + Sync + 'a, { if bucket.len() < 2 { return; } - recombinating_sort(bucket, counts, tile_counts, tile_size, level); + recombinating_sort( + &self.cm, + bucket, + &counts.borrow(), + tile_counts, + tile_size, + level, + ); if level == 0 { return; @@ -103,6 +116,8 @@ impl<'a> Sorter<'a> { #[cfg(test)] mod tests { + + use crate::counts::CountManager; use crate::sorter::Sorter; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; @@ -117,8 +132,6 @@ mod tests { where T: NumericTest, { - let sorter = Sorter::new(true, &StandardTuner); - sort_comparison_suite(shift, |inputs| { let level = T::LEVELS - 1; let tile_size = cdiv(inputs.len(), current_num_threads()); @@ -127,16 +140,13 @@ mod tests { return; } - let (tile_counts, _) = get_tile_counts(inputs, tile_size, level); - let counts = aggregate_tile_counts(&tile_counts); + let cm = CountManager::default(); + let sorter = Sorter::new(true, &StandardTuner); + + let (tile_counts, _) = get_tile_counts(&cm, inputs, tile_size, level); + let counts = aggregate_tile_counts(&cm, &tile_counts); - sorter.recombinating_sort_adapter( - inputs, - &counts, - &tile_counts, - tile_size, - T::LEVELS - 1, - ) + sorter.recombinating_sort_adapter(inputs, counts, tile_counts, tile_size, T::LEVELS - 1) }); } @@ -177,8 +187,6 @@ mod tests { #[test] pub fn test_u32_patterns() { - let sorter = Sorter::new(true, &StandardTuner); - validate_u32_patterns(|inputs| { let level = u32::LEVELS - 1; let tile_size = cdiv(inputs.len(), current_num_threads()); @@ -187,10 +195,13 @@ mod tests { return; } - let (tile_counts, _) = get_tile_counts(inputs, tile_size, level); - let counts = aggregate_tile_counts(&tile_counts); + let cm = CountManager::default(); + let sorter = Sorter::new(true, &StandardTuner); + + let (tile_counts, _) = get_tile_counts(&cm, inputs, tile_size, level); + let counts = aggregate_tile_counts(&cm, &tile_counts); - sorter.recombinating_sort_adapter(inputs, &counts, &tile_counts, tile_size, level) + sorter.recombinating_sort_adapter(inputs, counts, tile_counts, tile_size, level) }); } } diff --git a/src/sorts/regions_sort.rs b/src/sorts/regions_sort.rs index ffe33da..eb216ef 100644 --- a/src/sorts/regions_sort.rs +++ b/src/sorts/regions_sort.rs @@ -40,12 +40,15 @@ use crate::sorter::Sorter; use crate::sorts::ska_sort::ska_sort; -use crate::utils::*; +use std::cell::RefCell; + +use crate::counts::{CountManager, Counts}; use crate::RadixKey; use partition::partition_index; use rayon::current_num_threads; use rayon::prelude::*; use std::cmp::{min, Ordering}; +use std::rc::Rc; /// Operation represents a pair of edges, which have content slices that need to be swapped. struct Operation<'bucket, T>(Edge<'bucket, T>, Edge<'bucket, T>); @@ -65,10 +68,10 @@ struct Edge<'bucket, T> { /// for that country. fn generate_outbounds<'bucket, T>( bucket: &'bucket mut [T], - local_counts: &[[usize; 256]], - global_counts: &[usize; 256], + local_counts: &[Counts], + global_counts: &Counts, ) -> Vec> { - let mut outbounds: Vec> = Vec::new(); + let mut outbounds: Vec> = Vec::with_capacity(256); let mut rem_bucket = bucket; let mut local_bucket = 0; let mut local_country = 0; @@ -123,37 +126,40 @@ fn generate_outbounds<'bucket, T>( } /// list_operations takes the lists of outbounds and turns it into a list of swaps to perform -fn list_operations( +fn list_operations<'a, T>( country: usize, - mut outbounds: Vec>, -) -> (Vec>, Vec>) { + outbounds: &mut Vec>, + operations: &mut Vec>, + inbounds_scratch: &mut Vec>, + outbounds_scratch: &mut Vec>, +) { + // 2. Calculate inbounds for country + let ib = partition_index(outbounds, |e| e.dst != country); + inbounds_scratch.extend(outbounds.drain(ib..)); + outbounds.truncate(ib); + // 1. Extract current country outbounds from full outbounds list // NOTE(nathan): Partitioning a single array benched faster than // keeping an array per country (256 arrays total). - let ob = partition_index(&mut outbounds, |e| e.init != country); - let mut current_outbounds = outbounds.split_off(ob); - - // 2. Calculate inbounds for country - let p = partition_index(&mut outbounds, |e| e.dst != country); - let mut inbounds = outbounds.split_off(p); + let ob = partition_index(outbounds, |e| e.init != country); + outbounds_scratch.extend(outbounds.drain(ob..)); + outbounds.truncate(ob); // 3. Pair up inbounds & outbounds into an operation, returning unmatched data to the working arrays - let mut operations = Vec::new(); - loop { - let i = match inbounds.pop() { + let i = match inbounds_scratch.pop() { Some(i) => i, None => { - outbounds.append(&mut current_outbounds); + outbounds.append(outbounds_scratch); break; } }; - let o = match current_outbounds.pop() { + let o = match outbounds_scratch.pop() { Some(o) => o, None => { outbounds.push(i); - outbounds.append(&mut inbounds); + outbounds.append(inbounds_scratch); break; } }; @@ -163,7 +169,7 @@ fn list_operations( Ordering::Less => { let (sl, rem) = o.slice.split_at_mut(i.slice.len()); - current_outbounds.push(Edge { + outbounds_scratch.push(Edge { dst: o.dst, init: o.init, slice: rem, @@ -180,7 +186,7 @@ fn list_operations( Ordering::Greater => { let (sl, rem) = i.slice.split_at_mut(o.slice.len()); - inbounds.push(Edge { + inbounds_scratch.push(Edge { dst: i.dst, init: i.init, slice: rem, @@ -198,15 +204,13 @@ fn list_operations( operations.push(op); } - - // 4. Return the paired operations - (outbounds, operations) } pub fn regions_sort( + cm: &CountManager, bucket: &mut [T], - counts: &[usize; 256], - tile_counts: &[[usize; 256]], + counts: &Counts, + tile_counts: Vec, tile_size: usize, level: usize, ) where @@ -217,13 +221,22 @@ pub fn regions_sort( .par_chunks_mut(tile_size) .zip(tile_counts.par_iter()) .for_each(|(chunk, counts)| { - let mut prefix_sums = get_prefix_sums(counts); - let end_offsets = get_end_offsets(counts, &prefix_sums); - ska_sort(chunk, &mut prefix_sums, &end_offsets, level); + let prefix_sums = cm.prefix_sums(counts); + let end_offsets = cm.end_offsets(counts, &prefix_sums.borrow()); + ska_sort( + chunk, + &mut prefix_sums.borrow_mut(), + &end_offsets.borrow(), + level, + ); + cm.return_counts(prefix_sums); + cm.return_counts(end_offsets); }); - let mut outbounds = generate_outbounds(bucket, tile_counts, counts); - let mut operations = Vec::new(); + let mut outbounds = generate_outbounds(bucket, &tile_counts, counts); + let mut operations = Vec::with_capacity(2048); + let mut inbounds_scratch = Vec::with_capacity(256); + let mut outbounds_scratch = Vec::with_capacity(256); // This loop calculates and executes all operations that can be done in parallel, each pass. loop { @@ -233,9 +246,13 @@ pub fn regions_sort( // List out all the operations that need to be executed in this pass for country in 0..256 { - let (new_outbounds, mut new_ops) = list_operations(country, outbounds); - outbounds = new_outbounds; - operations.append(&mut new_ops); + list_operations( + country, + &mut outbounds, + &mut operations, + &mut inbounds_scratch, + &mut outbounds_scratch, + ); } if operations.is_empty() { @@ -265,18 +282,22 @@ impl<'a> Sorter<'a> { pub(crate) fn regions_sort_adapter( &self, bucket: &mut [T], - counts: &[usize; 256], - tile_counts: &[[usize; 256]], + counts: Rc>, + tile_counts: Vec, tile_size: usize, level: usize, ) where - T: RadixKey + Sized + Send + Copy + Sync, + T: RadixKey + Sized + Send + Copy + Sync + 'a, { if bucket.len() < 2 { return; } - regions_sort(bucket, counts, tile_counts, tile_size, level); + let c = counts.borrow(); + + regions_sort(&self.cm, bucket, &c, tile_counts, tile_size, level); + + drop(c); if level == 0 { return; @@ -288,6 +309,7 @@ impl<'a> Sorter<'a> { #[cfg(test)] mod tests { + use crate::counts::CountManager; use crate::sorter::Sorter; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; @@ -302,18 +324,18 @@ mod tests { where T: NumericTest, { - let sorter = Sorter::new(true, &StandardTuner); - sort_comparison_suite(shift, |inputs| { + let cm = CountManager::default(); + let sorter = Sorter::new(true, &StandardTuner); if inputs.len() == 0 { return; } let tile_size = cdiv(inputs.len(), current_num_threads()); - let (tile_counts, _) = get_tile_counts(inputs, tile_size, T::LEVELS - 1); - let counts = aggregate_tile_counts(&tile_counts); + let (tile_counts, _) = get_tile_counts(&cm, inputs, tile_size, T::LEVELS - 1); + let counts = aggregate_tile_counts(&cm, &tile_counts); - sorter.regions_sort_adapter(inputs, &counts, &tile_counts, tile_size, T::LEVELS - 1); + sorter.regions_sort_adapter(inputs, counts, tile_counts, tile_size, T::LEVELS - 1); }); } @@ -354,18 +376,19 @@ mod tests { #[test] pub fn test_u32_patterns() { - let sorter = Sorter::new(true, &StandardTuner); - validate_u32_patterns(|inputs| { if inputs.len() == 0 { return; } + let cm = CountManager::default(); + let sorter = Sorter::new(true, &StandardTuner); + let tile_size = cdiv(inputs.len(), current_num_threads()); - let (tile_counts, _) = get_tile_counts(inputs, tile_size, u32::LEVELS - 1); - let counts = aggregate_tile_counts(&tile_counts); + let (tile_counts, _) = get_tile_counts(&cm, inputs, tile_size, u32::LEVELS - 1); + let counts = aggregate_tile_counts(&cm, &tile_counts); - sorter.regions_sort_adapter(inputs, &counts, &tile_counts, tile_size, u32::LEVELS - 1); + sorter.regions_sort_adapter(inputs, counts, tile_counts, tile_size, u32::LEVELS - 1); }); } } diff --git a/src/sorts/scanning_sort.rs b/src/sorts/scanning_sort.rs index c96748f..6c3077c 100644 --- a/src/sorts/scanning_sort.rs +++ b/src/sorts/scanning_sort.rs @@ -33,14 +33,16 @@ //! overhead of the thread-local stores and mutexes prevents it from being fast for smaller inputs //! however, so it should not be used in all situations. +use crate::counts::{Counts, PrefixSums}; use crate::sorter::Sorter; -use crate::utils::*; use crate::RadixKey; use arbitrary_chunks::ArbitraryChunks; use partition::partition_index; use rayon::current_num_threads; use rayon::prelude::*; +use std::cell::RefCell; use std::cmp::{max, min}; +use std::rc::Rc; use std::sync::Mutex; struct ScannerBucketInner<'a, T> { @@ -58,13 +60,13 @@ struct ScannerBucket<'a, T> { #[inline] fn get_scanner_buckets<'a, T>( - counts: &[usize; 256], - prefix_sums: &[usize; 256], + counts: &Counts, + prefix_sums: &PrefixSums, bucket: &'a mut [T], ) -> Vec> { let mut running_count = 0; let mut out: Vec<_> = bucket - .arbitrary_chunks_mut(counts) + .arbitrary_chunks_mut(counts.inner()) .enumerate() .map(|(index, chunk)| { let head = prefix_sums[index] - running_count; @@ -97,8 +99,7 @@ fn scanner_thread( ) where T: RadixKey + Copy, { - let mut stash: Vec> = Vec::with_capacity(256); - stash.resize(256, Vec::with_capacity(128)); + let mut stash: Vec> = vec![Vec::with_capacity(128); 256]; let mut finished_count = 0; let mut finished_map = [false; 256]; @@ -201,11 +202,10 @@ fn scanner_thread( let to_write = to_write as usize; let split = stash[m.index].len() - to_write; - let some = stash[m.index].split_off(split); let end = guard.write_head + to_write; let start = guard.write_head; - - guard.chunk[start..end].copy_from_slice(&some); + guard.chunk[start..end].copy_from_slice(&stash[m.index][split..]); + stash[m.index].truncate(split); guard.write_head += to_write; @@ -221,15 +221,14 @@ fn scanner_thread( } } -pub fn scanning_sort(bucket: &mut [T], counts: &[usize; 256], level: usize) +pub fn scanning_sort(bucket: &mut [T], counts: &Counts, prefix_sums: &PrefixSums, level: usize) where T: RadixKey + Sized + Send + Copy + Sync, { let len = bucket.len(); let threads = current_num_threads(); let uniform_threshold = ((len / threads) as f64 * 1.4) as usize; - let prefix_sums = get_prefix_sums(counts); - let scanner_buckets = get_scanner_buckets(counts, &prefix_sums, bucket); + let scanner_buckets = get_scanner_buckets(counts, prefix_sums, bucket); let threads = min(threads, scanner_buckets.len()); let scaling_factor = max(1, (threads as f32).log2().ceil() as isize) as usize; let scanner_read_size = (32768 / scaling_factor) as isize; @@ -251,16 +250,18 @@ impl<'a> Sorter<'a> { pub(crate) fn scanning_sort_adapter( &self, bucket: &mut [T], - counts: &[usize; 256], + counts: Rc>, level: usize, ) where - T: RadixKey + Sized + Send + Copy + Sync, + T: RadixKey + Sized + Send + Copy + Sync + 'a, { if bucket.len() < 2 { return; } - scanning_sort(bucket, counts, level); + let prefix_sums = self.cm.prefix_sums(&counts.borrow()); + scanning_sort(bucket, &counts.borrow(), &prefix_sums.borrow(), level); + self.cm.return_counts(prefix_sums); if level == 0 { return; @@ -272,10 +273,10 @@ impl<'a> Sorter<'a> { #[cfg(test)] mod tests { + use crate::counts::CountManager; use crate::sorter::Sorter; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; - use crate::utils::par_get_counts; use crate::utils::test_utils::{ sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, }; @@ -285,12 +286,12 @@ mod tests { where T: NumericTest, { - let sorter = Sorter::new(true, &StandardTuner); - sort_comparison_suite(shift, |inputs| { - let (counts, _) = par_get_counts(inputs, T::LEVELS - 1); + let cm = CountManager::default(); + let sorter = Sorter::new(true, &StandardTuner); + let (counts, _) = cm.counts(inputs, T::LEVELS - 1); - sorter.scanning_sort_adapter(inputs, &counts, T::LEVELS - 1) + sorter.scanning_sort_adapter(inputs, counts, T::LEVELS - 1) }); } @@ -331,12 +332,12 @@ mod tests { #[test] pub fn test_u32_patterns() { - let sorter = Sorter::new(true, &StandardTuner); - validate_u32_patterns(|inputs| { - let (counts, _) = par_get_counts(inputs, u32::LEVELS - 1); + let cm = CountManager::default(); + let sorter = Sorter::new(true, &StandardTuner); + let (counts, _) = cm.counts(inputs, u32::LEVELS - 1); - sorter.scanning_sort_adapter(inputs, &counts, u32::LEVELS - 1) + sorter.scanning_sort_adapter(inputs, counts, u32::LEVELS - 1) }); } } diff --git a/src/sorts/ska_sort.rs b/src/sorts/ska_sort.rs index 934968d..df0eea5 100644 --- a/src/sorts/ska_sort.rs +++ b/src/sorts/ska_sort.rs @@ -20,15 +20,17 @@ //! This is generally slower than `lsb_sort` for smaller types T or smaller input arrays. For larger //! types or inputs, the memory efficiency of this algorithm can make it faster than `lsb_sort`. +use crate::counts::{Counts, EndOffsets, PrefixSums}; use crate::sorter::Sorter; -use crate::utils::*; use crate::RadixKey; use partition::partition_index; +use std::cell::RefCell; +use std::rc::Rc; pub fn ska_sort( bucket: &mut [T], - prefix_sums: &mut [usize; 256], - end_offsets: &[usize; 256], + prefix_sums: &mut PrefixSums, + end_offsets: &EndOffsets, level: usize, ) where T: RadixKey + Sized + Send + Copy + Sync, @@ -89,18 +91,30 @@ pub fn ska_sort( } impl<'a> Sorter<'a> { - pub(crate) fn ska_sort_adapter(&self, bucket: &mut [T], counts: &[usize; 256], level: usize) - where - T: RadixKey + Sized + Send + Copy + Sync, + pub(crate) fn ska_sort_adapter( + &self, + bucket: &mut [T], + counts: Rc>, + level: usize, + ) where + T: RadixKey + Sized + Send + Copy + Sync + 'a, { if bucket.len() < 2 { return; } - let mut prefix_sums = get_prefix_sums(counts); - let end_offsets = get_end_offsets(counts, &prefix_sums); + let prefix_sums = self.cm.prefix_sums(&counts.borrow()); + let end_offsets = self.cm.end_offsets(&counts.borrow(), &prefix_sums.borrow()); + + ska_sort( + bucket, + &mut prefix_sums.borrow_mut(), + &end_offsets.borrow(), + level, + ); - ska_sort(bucket, &mut prefix_sums, &end_offsets, level); + self.cm.return_counts(prefix_sums); + self.cm.return_counts(end_offsets); if level == 0 { return; @@ -115,7 +129,6 @@ mod tests { use crate::sorter::Sorter; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; - use crate::utils::get_counts; use crate::utils::test_utils::{ sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, }; @@ -125,12 +138,11 @@ mod tests { where T: NumericTest, { - let sorter = Sorter::new(true, &StandardTuner); - sort_comparison_suite(shift, |inputs| { - let (counts, _) = get_counts(inputs, T::LEVELS - 1); + let sorter = Sorter::new(true, &StandardTuner); + let (counts, _) = sorter.cm.counts(inputs, T::LEVELS - 1); - sorter.ska_sort_adapter(inputs, &counts, T::LEVELS - 1); + sorter.ska_sort_adapter(inputs, counts, T::LEVELS - 1); }); } @@ -171,12 +183,11 @@ mod tests { #[test] pub fn test_u32_patterns() { - let sorter = Sorter::new(true, &StandardTuner); - validate_u32_patterns(|inputs| { - let (counts, _) = get_counts(inputs, u32::LEVELS - 1); + let sorter = Sorter::new(true, &StandardTuner); + let (counts, _) = sorter.cm.counts(inputs, u32::LEVELS - 1); - sorter.ska_sort_adapter(inputs, &counts, u32::LEVELS - 1); + sorter.ska_sort_adapter(inputs, counts, u32::LEVELS - 1); }); } } diff --git a/src/utils/sort_utils.rs b/src/utils/sort_utils.rs index a6f6dc2..6eca8d2 100644 --- a/src/utils/sort_utils.rs +++ b/src/utils/sort_utils.rs @@ -1,208 +1,9 @@ +use crate::counts::{CountManager, CountMeta, Counts}; use crate::RadixKey; #[cfg(feature = "multi-threaded")] use rayon::prelude::*; -#[cfg(feature = "multi-threaded")] -use std::sync::mpsc::channel; - -#[inline] -pub fn get_prefix_sums(counts: &[usize; 256]) -> [usize; 256] { - let mut sums = [0usize; 256]; - - let mut running_total = 0; - for (i, c) in counts.iter().enumerate() { - sums[i] = running_total; - running_total += c; - } - - sums -} - -#[inline] -pub fn get_end_offsets(counts: &[usize; 256], prefix_sums: &[usize; 256]) -> [usize; 256] { - let mut end_offsets = [0usize; 256]; - - end_offsets[0..255].copy_from_slice(&prefix_sums[1..256]); - end_offsets[255] = counts[255] + prefix_sums[255]; - - end_offsets -} - -#[inline] -#[cfg(any(test, bench, tuning))] -pub fn par_get_counts(bucket: &[T], level: usize) -> ([usize; 256], bool) -where - T: RadixKey + Sized + Send + Sync, -{ - if bucket.len() == 0 { - return ([0usize; 256], true); - } - - let (counts, sorted, _, _) = par_get_counts_with_ends(bucket, level); - (counts, sorted) -} - -#[inline] -#[cfg(feature = "multi-threaded")] -pub fn par_get_counts_with_ends(bucket: &[T], level: usize) -> ([usize; 256], bool, u8, u8) -where - T: RadixKey + Sized + Send + Sync, -{ - #[cfg(feature = "work_profiles")] - println!("({}) PAR_COUNT", level); - - if bucket.len() < 400_000 { - return get_counts_with_ends(bucket, level); - } - - let threads = rayon::current_num_threads(); - let chunk_divisor = 8; - let chunk_size = (bucket.len() / threads / chunk_divisor) + 1; - let chunks = bucket.par_chunks(chunk_size); - let len = chunks.len(); - let (tx, rx) = channel(); - - chunks.enumerate().for_each_with(tx, |tx, (i, chunk)| { - let counts = get_counts_with_ends(chunk, level); - tx.send((i, counts.0, counts.1, counts.2, counts.3)) - .unwrap(); - }); - - let mut msb_counts = [0usize; 256]; - let mut already_sorted = true; - let mut boundaries = vec![(0u8, 0u8); len]; - - for _ in 0..len { - let (i, counts, chunk_sorted, start, end) = rx.recv().unwrap(); - - if !chunk_sorted { - already_sorted = false; - } - - boundaries[i].0 = start; - boundaries[i].1 = end; - - for (i, c) in counts.iter().enumerate() { - msb_counts[i] += *c; - } - } - - // Check the boundaries of each counted chunk, to see if the full bucket - // is already sorted - if already_sorted { - for w in boundaries.windows(2) { - if w[1].0 < w[0].1 { - already_sorted = false; - break; - } - } - } - - ( - msb_counts, - already_sorted, - boundaries[0].0, - boundaries[boundaries.len() - 1].1, - ) -} - -#[inline] -pub fn get_counts_with_ends(bucket: &[T], level: usize) -> ([usize; 256], bool, u8, u8) -where - T: RadixKey, -{ - #[cfg(feature = "work_profiles")] - println!("({}) COUNT", level); - - let mut already_sorted = true; - let mut continue_from = bucket.len(); - let mut counts_1 = [0usize; 256]; - let mut last = 0usize; - - for (i, item) in bucket.iter().enumerate() { - let b = item.get_level(level) as usize; - counts_1[b] += 1; - - if b < last { - continue_from = i + 1; - already_sorted = false; - break; - } - - last = b; - } - - if continue_from == bucket.len() { - return ( - counts_1, - already_sorted, - bucket[0].get_level(level), - last as u8, - ); - } - - let mut counts_2 = [0usize; 256]; - let mut counts_3 = [0usize; 256]; - let mut counts_4 = [0usize; 256]; - let chunks = bucket[continue_from..].chunks_exact(4); - let rem = chunks.remainder(); - - chunks.into_iter().for_each(|chunk| { - let a = chunk[0].get_level(level) as usize; - let b = chunk[1].get_level(level) as usize; - let c = chunk[2].get_level(level) as usize; - let d = chunk[3].get_level(level) as usize; - - counts_1[a] += 1; - counts_2[b] += 1; - counts_3[c] += 1; - counts_4[d] += 1; - }); - - rem.iter().for_each(|v| { - let b = v.get_level(level) as usize; - counts_1[b] += 1; - }); - - for i in 0..256 { - counts_1[i] += counts_2[i]; - counts_1[i] += counts_3[i]; - counts_1[i] += counts_4[i]; - } - - let b_first = bucket.first().unwrap().get_level(level); - let b_last = bucket.last().unwrap().get_level(level); - - (counts_1, already_sorted, b_first, b_last) -} - -#[inline] -pub fn get_counts(bucket: &[T], level: usize) -> ([usize; 256], bool) -where - T: RadixKey, -{ - if bucket.is_empty() { - return ([0usize; 256], true); - } - - let (counts, sorted, _, _) = get_counts_with_ends(bucket, level); - - (counts, sorted) -} - -#[allow(clippy::uninit_vec)] -#[inline] -pub fn get_tmp_bucket(len: usize) -> Vec { - let mut tmp_bucket = Vec::with_capacity(len); - unsafe { - // Safety: This will leave the vec with potentially uninitialized data - // however as we account for every value when placing things - // into tmp_bucket, this is "safe". This is used because it provides a - // very significant speed improvement over resize, to_vec etc. - tmp_bucket.set_len(len); - } - - tmp_bucket -} +use std::cell::RefCell; +use std::rc::Rc; #[inline] pub const fn cdiv(a: usize, b: usize) -> usize { @@ -210,59 +11,80 @@ pub const fn cdiv(a: usize, b: usize) -> usize { } #[inline] -pub fn get_tile_counts(bucket: &[T], tile_size: usize, level: usize) -> (Vec<[usize; 256]>, bool) +pub fn get_tile_counts( + cm: &CountManager, + bucket: &[T], + tile_size: usize, + level: usize, +) -> (Vec, bool) where T: RadixKey + Copy + Sized + Send + Sync, { #[cfg(feature = "work_profiles")] println!("({}) TILE_COUNT", level); + let num_tiles = cdiv(bucket.len(), tile_size); + let mut tiles: Vec = vec![Counts::new(); num_tiles]; + let mut meta: Vec = vec![CountMeta::default(); num_tiles]; + #[cfg(feature = "multi-threaded")] - let tiles: Vec<([usize; 256], bool, u8, u8)> = bucket + bucket .par_chunks(tile_size) - .map(|chunk| par_get_counts_with_ends(chunk, level)) - .collect(); + .zip(tiles.par_iter_mut()) + .zip(meta.par_iter_mut()) + .for_each(|((chunk, counts), meta)| { + cm.count_into(counts, meta, chunk, level); + }); #[cfg(not(feature = "multi-threaded"))] - let tiles: Vec<([usize; 256], bool, u8, u8)> = bucket + bucket .chunks(tile_size) - .map(|chunk| get_counts_with_ends(chunk, level)) - .collect(); + .zip(tiles.par_iter_mut()) + .zip(meta.par_iter_mut()) + .for_each(|((chunk, counts), meta)| { + cm.count_into(counts, meta, chunk, level); + }); let mut all_sorted = true; if tiles.len() == 1 { // If there is only one tile, we already have a flag for if it is sorted - all_sorted = tiles[0].1; + all_sorted = meta[0].already_sorted; } else { // Check if any of the tiles, or any of the tile boundaries are unsorted - for tile in tiles.windows(2) { - if !tile[0].1 || !tile[1].1 || tile[1].2 < tile[0].3 { + for w in meta.windows(2) { + let left = &w[0]; + let right = &w[1]; + if !left.already_sorted || !right.already_sorted || right.first < left.last { all_sorted = false; break; } } } - (tiles.into_iter().map(|v| v.0).collect(), all_sorted) + (tiles, all_sorted) } #[inline] -pub fn aggregate_tile_counts(tile_counts: &[[usize; 256]]) -> [usize; 256] { - let mut out = tile_counts[0]; - for tile in tile_counts.iter().skip(1) { - for i in 0..256 { - out[i] += tile[i]; +pub fn aggregate_tile_counts(cm: &CountManager, tile_counts: &[Counts]) -> Rc> { + let out = cm.get_empty_counts(); + let mut counts = out.borrow_mut(); + + for tile in tile_counts.iter() { + for i in 0..256usize { + counts.add(i, tile[i]); } } + drop(counts); + out } #[inline] -pub fn is_homogenous_bucket(counts: &[usize; 256]) -> bool { +pub fn is_homogenous(counts: &Counts) -> bool { let mut seen = false; - for c in counts { + for c in counts.into_iter() { if *c > 0 { if seen { return false; @@ -277,31 +99,34 @@ pub fn is_homogenous_bucket(counts: &[usize; 256]) -> bool { #[cfg(test)] mod tests { + use crate::counts::CountManager; use crate::utils::get_tile_counts; #[test] pub fn test_get_tile_counts_correctly_marks_already_sorted_single_tile() { + let cm = CountManager::default(); let mut data: Vec = vec![0, 5, 2, 3, 1]; - let (_counts, already_sorted) = get_tile_counts(&mut data, 5, 0); + let (_counts, already_sorted) = get_tile_counts(&cm, &mut data, 5, 0); assert_eq!(already_sorted, false); let mut data: Vec = vec![0, 0, 1, 1, 2]; - let (_counts, already_sorted) = get_tile_counts(&mut data, 5, 0); + let (_counts, already_sorted) = get_tile_counts(&cm, &mut data, 5, 0); assert_eq!(already_sorted, true); } #[test] pub fn test_get_tile_counts_correctly_marks_already_sorted_multiple_tiles() { + let cm = CountManager::default(); let mut data: Vec = vec![0, 5, 2, 3, 1]; - let (_counts, already_sorted) = get_tile_counts(&mut data, 2, 0); + let (_counts, already_sorted) = get_tile_counts(&cm, &mut data, 2, 0); assert_eq!(already_sorted, false); let mut data: Vec = vec![0, 0, 1, 1, 2]; - let (_counts, already_sorted) = get_tile_counts(&mut data, 2, 0); + let (_counts, already_sorted) = get_tile_counts(&cm, &mut data, 2, 0); assert_eq!(already_sorted, true); } } diff --git a/src/utils/test_utils.rs b/src/utils/test_utils.rs index ad751a2..39de5aa 100644 --- a/src/utils/test_utils.rs +++ b/src/utils/test_utils.rs @@ -96,7 +96,7 @@ where pub fn validate_sort(mut inputs: Vec, sort_fn: F) where - T: NumericTest, + T: NumericTest + Debug, F: Fn(&mut [T]), { let mut inputs_clone = inputs.clone(); @@ -117,6 +117,18 @@ where } inputs_clone.sort_unstable(); + + for i in 0..inputs.len() { + let a = inputs[i]; + let b = inputs_clone[i]; + assert_eq!( + a, + b, + "Mismatch at index {:?} vs. {:?}", + inputs[i - 5..i + 5].to_vec(), + inputs_clone[i - 5..i + 5].to_vec() + ); + } assert_eq!(inputs, inputs_clone); } From 165ddbdcf504c5e2baf8bc39f84a9e2d1a89affe Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Tue, 30 Jan 2024 11:52:53 +0900 Subject: [PATCH 02/24] Restore correct version of arbitrary-chunks --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 0d268cd..7718724 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ timings = ["multi-threaded"] [dependencies] rayon = { version = "1.8", optional = true } -arbitrary-chunks = { path = "../arbitrary-chunks" } +arbitrary-chunks = "0.4.1" partition = "0.1.2" bumpalo = { version = "3.14.0", features = ["collections"] } From efecead7574ec8e03817da7df8fafd96aeb0e308 Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Tue, 30 Jan 2024 12:40:16 +0900 Subject: [PATCH 03/24] Remove unused features of Count --- src/counts.rs | 69 +++-------------------------------------- src/utils/sort_utils.rs | 2 +- 2 files changed, 5 insertions(+), 66 deletions(-) diff --git a/src/counts.rs b/src/counts.rs index ba08747..dcc626c 100644 --- a/src/counts.rs +++ b/src/counts.rs @@ -192,7 +192,7 @@ impl Counter { return; } else if bucket.len() == 1 { let b = bucket[0].get_level(level) as usize; - counts.inc(b); + counts[b] += 1; meta.first = b as u8; meta.last = b as u8; @@ -210,7 +210,7 @@ impl Counter { // First, count directly into the output buffer until we find a value that is out of order. for (i, item) in bucket.iter().enumerate() { let b = item.get_level(level) as usize; - counts.inc(b); + counts[b] += 1; if b < prev { continue_from = i + 1; @@ -245,12 +245,12 @@ impl Counter { rem.iter().for_each(|v| { let b = v.get_level(level) as usize; - counts.inc(b); + counts[b] += 1; }); for i in 0..256 { let agg = self.0[i * 4] + self.0[1 + i * 4] + self.0[2 + i * 4] + self.0[3 + i * 4]; - counts.add(i, agg); + counts[i] += agg; } meta.first = first; @@ -259,45 +259,12 @@ impl Counter { } } -pub struct CountIter<'a>(&'a Counts, usize); - -pub struct CountIterEnumerable<'a>(&'a mut CountIter<'a>); - -impl<'a> CountIter<'a> { - #[inline(always)] - pub fn enumerate(&'a mut self) -> CountIterEnumerable<'a> { - CountIterEnumerable(self) - } -} - impl Counts { #[inline(always)] pub fn clear(&mut self) { self.0.iter_mut().for_each(|x| *x = 0); } - #[inline(always)] - pub fn get_count(self, radix: usize) -> usize { - debug_assert!(radix < 256); - unsafe { *self.0.get_unchecked(radix) } - } - - #[inline(always)] - pub fn inc(&mut self, radix: usize) { - debug_assert!(radix < 256); - unsafe { - *self.0.get_unchecked_mut(radix) += 1; - } - } - - #[inline(always)] - pub fn add(&mut self, radix: usize, count: usize) { - debug_assert!(radix < 256); - unsafe { - *self.0.get_unchecked_mut(radix) += count; - } - } - #[inline(always)] pub fn new() -> Self { Self::default() @@ -309,34 +276,6 @@ impl Counts { } } -impl Iterator for CountIter<'_> { - type Item = usize; - - #[inline(always)] - fn next(&mut self) -> Option { - if self.1 == 256 { - return None; - } - - let out = self.0[self.1]; - self.1 += 1; - - Some(out) - } - - #[inline(always)] - fn size_hint(&self) -> (usize, Option) { - (256 - self.1, Some(256 - self.1)) - } -} - -impl ExactSizeIterator for CountIter<'_> { - #[inline(always)] - fn len(&self) -> usize { - 256 - self.1 - } -} - impl IntoIterator for Counts { type Item = usize; type IntoIter = core::array::IntoIter; diff --git a/src/utils/sort_utils.rs b/src/utils/sort_utils.rs index 6eca8d2..d6b6795 100644 --- a/src/utils/sort_utils.rs +++ b/src/utils/sort_utils.rs @@ -72,7 +72,7 @@ pub fn aggregate_tile_counts(cm: &CountManager, tile_counts: &[Counts]) -> Rc Date: Sun, 4 Feb 2024 14:42:51 +0900 Subject: [PATCH 04/24] Faster default implementations of RadixKey --- src/radix_key_impl.rs | 94 +++++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 26 deletions(-) diff --git a/src/radix_key_impl.rs b/src/radix_key_impl.rs index a9e6336..05e88e0 100644 --- a/src/radix_key_impl.rs +++ b/src/radix_key_impl.rs @@ -3,7 +3,7 @@ use crate::RadixKey; impl RadixKey for u8 { const LEVELS: usize = 1; - #[inline] + #[inline(always)] fn get_level(&self, _: usize) -> u8 { *self } @@ -12,36 +12,60 @@ impl RadixKey for u8 { impl RadixKey for u16 { const LEVELS: usize = 2; - #[inline] + #[inline(always)] fn get_level(&self, level: usize) -> u8 { - (self >> (level * 8)) as u8 + debug_assert!(level < Self::LEVELS); + + if cfg!(target_endian="little") { + unsafe { (self as *const Self as *const u8).wrapping_offset(level as isize).read() } + } else { + (self >> (level * 8)) as u8 + } } } impl RadixKey for u32 { const LEVELS: usize = 4; - #[inline] + #[inline(always)] fn get_level(&self, level: usize) -> u8 { - (self >> (level * 8)) as u8 + debug_assert!(level < Self::LEVELS); + + if cfg!(target_endian="little") { + unsafe { (self as *const Self as *const u8).wrapping_offset(level as isize).read() } + } else { + (self >> (level * 8)) as u8 + } } } impl RadixKey for u64 { const LEVELS: usize = 8; - #[inline] + #[inline(always)] fn get_level(&self, level: usize) -> u8 { - (self >> (level * 8)) as u8 + debug_assert!(level < Self::LEVELS); + + if cfg!(target_endian="little") { + unsafe { (self as *const Self as *const u8).wrapping_offset(level as isize).read() } + } else { + (self >> (level * 8)) as u8 + } } } impl RadixKey for u128 { const LEVELS: usize = 16; - #[inline] + #[inline(always)] fn get_level(&self, level: usize) -> u8 { - (self >> (level * 8)) as u8 + debug_assert!(level < Self::LEVELS); + + if cfg!(target_endian="little") { + unsafe { (self as *const Self as *const u8).wrapping_offset(level as isize).read() } + } else { + (self >> (level * 8)) as u8 + } } } @@ -49,9 +73,15 @@ impl RadixKey for u128 { impl RadixKey for usize { const LEVELS: usize = 2; - #[inline] + #[inline(always)] fn get_level(&self, level: usize) -> u8 { - (self >> (level * 8)) as u8 + debug_assert!(level < Self::LEVELS); + + if cfg!(target_endian="little") { + unsafe { (self as *const Self as *const u8).wrapping_offset(level as isize).read() } + } else { + (self >> (level * 8)) as u8 + } } } @@ -59,9 +89,15 @@ impl RadixKey for usize { impl RadixKey for usize { const LEVELS: usize = 4; - #[inline] + #[inline(always)] fn get_level(&self, level: usize) -> u8 { - (self >> (level * 8)) as u8 + debug_assert!(level < Self::LEVELS); + + if cfg!(target_endian="little") { + unsafe { (self as *const Self as *const u8).wrapping_offset(level as isize).read() } + } else { + (self >> (level * 8)) as u8 + } } } @@ -69,16 +105,22 @@ impl RadixKey for usize { impl RadixKey for usize { const LEVELS: usize = 8; - #[inline] + #[inline(always)] fn get_level(&self, level: usize) -> u8 { - (self >> (level * 8)) as u8 + debug_assert!(level < Self::LEVELS); + + if cfg!(target_endian="little") { + unsafe { (self as *const Self as *const u8).wrapping_offset(level as isize).read() } + } else { + (self >> (level * 8)) as u8 + } } } impl RadixKey for [u8; N] { const LEVELS: usize = N; - #[inline] + #[inline(always)] fn get_level(&self, level: usize) -> u8 { self[level] } @@ -87,7 +129,7 @@ impl RadixKey for [u8; N] { impl RadixKey for i8 { const LEVELS: usize = 1; - #[inline] + #[inline(always)] fn get_level(&self, _: usize) -> u8 { (*self ^ i8::MIN) as u8 } @@ -96,7 +138,7 @@ impl RadixKey for i8 { impl RadixKey for i16 { const LEVELS: usize = 2; - #[inline] + #[inline(always)] fn get_level(&self, level: usize) -> u8 { ((self ^ i16::MIN) >> (level * 8)) as u8 } @@ -105,7 +147,7 @@ impl RadixKey for i16 { impl RadixKey for i32 { const LEVELS: usize = 4; - #[inline] + #[inline(always)] fn get_level(&self, level: usize) -> u8 { ((self ^ i32::MIN) >> (level * 8)) as u8 } @@ -114,7 +156,7 @@ impl RadixKey for i32 { impl RadixKey for i64 { const LEVELS: usize = 8; - #[inline] + #[inline(always)] fn get_level(&self, level: usize) -> u8 { ((self ^ i64::MIN) >> (level * 8)) as u8 } @@ -123,7 +165,7 @@ impl RadixKey for i64 { impl RadixKey for i128 { const LEVELS: usize = 16; - #[inline] + #[inline(always)] fn get_level(&self, level: usize) -> u8 { ((self ^ i128::MIN) >> (level * 8)) as u8 } @@ -133,7 +175,7 @@ impl RadixKey for i128 { impl RadixKey for isize { const LEVELS: usize = 2; - #[inline] + #[inline(always)] fn get_level(&self, level: usize) -> u8 { ((self ^ isize::MIN) >> (level * 8)) as u8 } @@ -143,7 +185,7 @@ impl RadixKey for isize { impl RadixKey for isize { const LEVELS: usize = 4; - #[inline] + #[inline(always)] fn get_level(&self, level: usize) -> u8 { ((self ^ isize::MIN) >> (level * 8)) as u8 } @@ -153,7 +195,7 @@ impl RadixKey for isize { impl RadixKey for isize { const LEVELS: usize = 8; - #[inline] + #[inline(always)] fn get_level(&self, level: usize) -> u8 { ((self ^ isize::MIN) >> (level * 8)) as u8 } @@ -162,7 +204,7 @@ impl RadixKey for isize { impl RadixKey for f32 { const LEVELS: usize = 4; - #[inline] + #[inline(always)] fn get_level(&self, level: usize) -> u8 { let mut s = self.to_bits() as i32; @@ -175,7 +217,7 @@ impl RadixKey for f32 { impl RadixKey for f64 { const LEVELS: usize = 8; - #[inline] + #[inline(always)] fn get_level(&self, level: usize) -> u8 { let mut s = self.to_bits() as i64; s ^= (((s >> 63) as u64) >> 1) as i64; From 8d2ab558713a7f3467d18382d0619674a856ae87 Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Mon, 5 Feb 2024 11:12:52 +0900 Subject: [PATCH 05/24] Use temporary buffers copied from source rather than uninitialized buffers due to undefined behavior --- src/counts.rs | 27 ++++++++++++++++++++------- src/sorts/lsb_sort.rs | 4 ++-- src/sorts/mt_lsb_sort.rs | 4 ++-- src/sorts/recombinating_sort.rs | 4 +--- 4 files changed, 25 insertions(+), 14 deletions(-) diff --git a/src/counts.rs b/src/counts.rs index dcc626c..23c2290 100644 --- a/src/counts.rs +++ b/src/counts.rs @@ -148,18 +148,31 @@ impl CountManager { } #[inline(always)] - pub fn with_tmp_buffer( - &self, - len: usize, - mut f: F, - ) { + pub fn with_tmp_buffer(&self, src_bucket: &mut [T], mut f: F) + where + T: Copy, + F: FnMut(&CountManager, &mut [T], &mut [T]), + { + let len = src_bucket.len(); Self::THREAD_CTX.with(|ct| { let mut tmp = bumpalo::collections::Vec::with_capacity_in(len, &ct.bump); - // Safety: It's up to the caller to ensure that all values in the tmp buffer are overwritten before use + + // Safety: Vec has the same capacity as the input size, and set_len is not called to set + // the full vec len until after all data has been initialized. Source data is Copy + // so a full copy of the source data is sufficient for initialization. + + // Note: This is done rather than something like extend because the performance + // is significantly better. Extend and co. use push and therefore increment len for each item + // where this simply copies all the data first then sets len once. + + // Existing data is used rather than just leaving it uninitialized until write because + // doing so is undefined behavior and MaybeUninit creates a bunch of code duplication and bloat. + // There's only a minor performance hit for using the copy instead of overwriting uninitialized data. unsafe { + std::ptr::copy_nonoverlapping(src_bucket.as_ptr(), tmp.as_mut_ptr(), len); tmp.set_len(len); } - f(self, &mut tmp); + f(self, src_bucket, &mut tmp); drop(tmp); }) } diff --git a/src/sorts/lsb_sort.rs b/src/sorts/lsb_sort.rs index 1c583e2..c3a0f0b 100644 --- a/src/sorts/lsb_sort.rs +++ b/src/sorts/lsb_sort.rs @@ -54,12 +54,12 @@ impl<'a> Sorter<'a> { return; } - self.cm.with_tmp_buffer(bucket.len(), |cm, tmp_bucket| { + self.cm.with_tmp_buffer(bucket, |cm, bucket, tmp_bucket| { let mut invert = false; let mut use_next_counts = false; let mut counts = cm.get_empty_counts(); let mut meta = CountMeta::default(); - let mut next_counts: Rc> = cm.get_empty_counts(); + let mut next_counts = cm.get_empty_counts(); for level in start_level..=end_level { if level == end_level { diff --git a/src/sorts/mt_lsb_sort.rs b/src/sorts/mt_lsb_sort.rs index 49e22f4..a6526a7 100644 --- a/src/sorts/mt_lsb_sort.rs +++ b/src/sorts/mt_lsb_sort.rs @@ -151,7 +151,7 @@ impl<'a> Sorter<'a> { return; } - self.cm.with_tmp_buffer(bucket.len(), |_, tmp_bucket| { + self.cm.with_tmp_buffer(bucket, |_, bucket, tmp_bucket| { let levels: Vec = (start_level..=end_level).collect(); let mut invert = false; @@ -200,7 +200,7 @@ impl<'a> Sorter<'a> { return; } - self.cm.with_tmp_buffer(bucket.len(), |_, tmp_bucket| { + self.cm.with_tmp_buffer(bucket, |_, bucket, tmp_bucket| { mt_lsb_sort(bucket, tmp_bucket, &tile_counts, tile_size, level); bucket diff --git a/src/sorts/recombinating_sort.rs b/src/sorts/recombinating_sort.rs index 04fd17d..acb187c 100644 --- a/src/sorts/recombinating_sort.rs +++ b/src/sorts/recombinating_sort.rs @@ -41,9 +41,7 @@ pub fn recombinating_sort( ) where T: RadixKey + Sized + Send + Copy + Sync, { - let bucket_len = bucket.len(); - - cm.with_tmp_buffer(bucket_len, |cm, tmp_bucket| { + cm.with_tmp_buffer(bucket, |cm, bucket, tmp_bucket| { bucket .par_chunks(tile_size) .zip(tmp_bucket.par_chunks_mut(tile_size)) From 2ee4670bec21c69f8b79e6580c766b7778b71473 Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Mon, 5 Feb 2024 11:13:28 +0900 Subject: [PATCH 06/24] Formatting & remove random extern --- src/cmd/profiling.rs | 6 ++--- src/lib.rs | 2 -- src/radix_key_impl.rs | 56 ++++++++++++++++++++++++++++++++----------- 3 files changed, 45 insertions(+), 19 deletions(-) diff --git a/src/cmd/profiling.rs b/src/cmd/profiling.rs index 5de799f..de0f3db 100644 --- a/src/cmd/profiling.rs +++ b/src/cmd/profiling.rs @@ -34,10 +34,10 @@ impl Tuner for MyTuner { fn main() { // Randomly generate an array of // 200_000_000 u64's with half shifted >> 32 and half shifted << 32 - let mut inputs = gen_inputs(200_000_000, 0u32); - let mut inputs_2 = gen_inputs(200_000_000, 0u32); + let mut inputs = gen_inputs(50_000_000, 0u128); + let mut inputs_2 = gen_inputs(50_000_000, 0u128); - // Input generation is multi-threaded and hard to differentiate from the actual + // Input generation is multithreaded and hard to differentiate from the actual // sorting algorithm, depending on the profiler. This makes it more obvious. sleep(Duration::from_millis(300)); diff --git a/src/lib.rs b/src/lib.rs index 3c97a1c..c33cc25 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -172,8 +172,6 @@ //! //! Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions. -extern crate core; - mod radix_key; mod radix_key_impl; mod radix_sort_builder; diff --git a/src/radix_key_impl.rs b/src/radix_key_impl.rs index 05e88e0..d2fa29a 100644 --- a/src/radix_key_impl.rs +++ b/src/radix_key_impl.rs @@ -16,8 +16,12 @@ impl RadixKey for u16 { fn get_level(&self, level: usize) -> u8 { debug_assert!(level < Self::LEVELS); - if cfg!(target_endian="little") { - unsafe { (self as *const Self as *const u8).wrapping_offset(level as isize).read() } + if cfg!(target_endian = "little") { + unsafe { + (self as *const Self as *const u8) + .wrapping_offset(level as isize) + .read() + } } else { (self >> (level * 8)) as u8 } @@ -31,8 +35,12 @@ impl RadixKey for u32 { fn get_level(&self, level: usize) -> u8 { debug_assert!(level < Self::LEVELS); - if cfg!(target_endian="little") { - unsafe { (self as *const Self as *const u8).wrapping_offset(level as isize).read() } + if cfg!(target_endian = "little") { + unsafe { + (self as *const Self as *const u8) + .wrapping_offset(level as isize) + .read() + } } else { (self >> (level * 8)) as u8 } @@ -46,8 +54,12 @@ impl RadixKey for u64 { fn get_level(&self, level: usize) -> u8 { debug_assert!(level < Self::LEVELS); - if cfg!(target_endian="little") { - unsafe { (self as *const Self as *const u8).wrapping_offset(level as isize).read() } + if cfg!(target_endian = "little") { + unsafe { + (self as *const Self as *const u8) + .wrapping_offset(level as isize) + .read() + } } else { (self >> (level * 8)) as u8 } @@ -61,8 +73,12 @@ impl RadixKey for u128 { fn get_level(&self, level: usize) -> u8 { debug_assert!(level < Self::LEVELS); - if cfg!(target_endian="little") { - unsafe { (self as *const Self as *const u8).wrapping_offset(level as isize).read() } + if cfg!(target_endian = "little") { + unsafe { + (self as *const Self as *const u8) + .wrapping_offset(level as isize) + .read() + } } else { (self >> (level * 8)) as u8 } @@ -77,8 +93,12 @@ impl RadixKey for usize { fn get_level(&self, level: usize) -> u8 { debug_assert!(level < Self::LEVELS); - if cfg!(target_endian="little") { - unsafe { (self as *const Self as *const u8).wrapping_offset(level as isize).read() } + if cfg!(target_endian = "little") { + unsafe { + (self as *const Self as *const u8) + .wrapping_offset(level as isize) + .read() + } } else { (self >> (level * 8)) as u8 } @@ -93,8 +113,12 @@ impl RadixKey for usize { fn get_level(&self, level: usize) -> u8 { debug_assert!(level < Self::LEVELS); - if cfg!(target_endian="little") { - unsafe { (self as *const Self as *const u8).wrapping_offset(level as isize).read() } + if cfg!(target_endian = "little") { + unsafe { + (self as *const Self as *const u8) + .wrapping_offset(level as isize) + .read() + } } else { (self >> (level * 8)) as u8 } @@ -109,8 +133,12 @@ impl RadixKey for usize { fn get_level(&self, level: usize) -> u8 { debug_assert!(level < Self::LEVELS); - if cfg!(target_endian="little") { - unsafe { (self as *const Self as *const u8).wrapping_offset(level as isize).read() } + if cfg!(target_endian = "little") { + unsafe { + (self as *const Self as *const u8) + .wrapping_offset(level as isize) + .read() + } } else { (self >> (level * 8)) as u8 } From e454bd20acb728be375d3502850b23a98ca8e5dc Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Mon, 5 Feb 2024 13:15:46 +0900 Subject: [PATCH 07/24] Clean up dependencies and speed up benchmark compilation --- Cargo.lock | 146 ------------------------------------- Cargo.toml | 11 +-- benches/full_sort.rs | 22 ++---- benches/struct_sort.rs | 17 ----- src/cmd/timings.rs | 6 -- src/counts.rs | 45 ++++++------ src/radix_key_impl.rs | 14 ++-- src/sorts/scanning_sort.rs | 2 +- src/utils/bench_utils.rs | 12 +-- 9 files changed, 41 insertions(+), 234 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d04b4f3..23f999c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -87,12 +87,6 @@ dependencies = [ "nanorand", ] -[[package]] -name = "bumpalo" -version = "3.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" - [[package]] name = "cast" version = "0.3.0" @@ -191,7 +185,6 @@ dependencies = [ "num-traits", "once_cell", "oorandom", - "plotters", "rayon", "regex", "serde", @@ -322,15 +315,6 @@ version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" -[[package]] -name = "js-sys" -version = "0.3.67" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a1d36f1235bc969acba30b7f5990b864423a6068a10f7c90ae8f0112e3a59d1" -dependencies = [ - "wasm-bindgen", -] - [[package]] name = "lazy_static" version = "1.4.0" @@ -359,12 +343,6 @@ dependencies = [ "scopeguard", ] -[[package]] -name = "log" -version = "0.4.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" - [[package]] name = "memchr" version = "2.7.1" @@ -451,34 +429,6 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "947f833aaa585cf12b8ec7c0476c98784c49f33b861376ffc84ed92adebf2aba" -[[package]] -name = "plotters" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45" -dependencies = [ - "num-traits", - "plotters-backend", - "plotters-svg", - "wasm-bindgen", - "web-sys", -] - -[[package]] -name = "plotters-backend" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609" - -[[package]] -name = "plotters-svg" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab" -dependencies = [ - "plotters-backend", -] - [[package]] name = "proc-macro2" version = "1.0.78" @@ -523,13 +473,10 @@ version = "0.20.14" dependencies = [ "arbitrary-chunks", "block-pseudorand", - "bumpalo", "criterion", "dhat", "partition", "rayon", - "tikv-jemallocator", - "voracious_radix_sort", ] [[package]] @@ -670,26 +617,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" -[[package]] -name = "tikv-jemalloc-sys" -version = "0.5.4+5.3.0-patched" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9402443cb8fd499b6f327e40565234ff34dbda27460c5b47db0db77443dd85d1" -dependencies = [ - "cc", - "libc", -] - -[[package]] -name = "tikv-jemallocator" -version = "0.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "965fe0c26be5c56c94e38ba547249074803efd52adfb66de62107d95aab3eaca" -dependencies = [ - "libc", - "tikv-jemalloc-sys", -] - [[package]] name = "tinytemplate" version = "1.2.1" @@ -706,15 +633,6 @@ version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" -[[package]] -name = "voracious_radix_sort" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "446e7ffcb6c27a71d05af7e51ef2ee5b71c48424b122a832f2439651e1914899" -dependencies = [ - "rayon", -] - [[package]] name = "walkdir" version = "2.4.0" @@ -725,70 +643,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "wasm-bindgen" -version = "0.2.90" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1223296a201415c7fad14792dbefaace9bd52b62d33453ade1c5b5f07555406" -dependencies = [ - "cfg-if", - "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.90" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcdc935b63408d58a32f8cc9738a0bffd8f05cc7c002086c6ef20b7312ad9dcd" -dependencies = [ - "bumpalo", - "log", - "once_cell", - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.90" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e4c238561b2d428924c49815533a8b9121c664599558a5d9ec51f8a1740a999" -dependencies = [ - "quote", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.90" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bae1abb6806dc1ad9e560ed242107c0f6c84335f1749dd4e8ddb012ebd5e25a7" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "wasm-bindgen-backend", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.90" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d91413b1c31d7539ba5ef2451af3f0b833a005eb27a631cec32bc0635a8602b" - -[[package]] -name = "web-sys" -version = "0.3.67" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58cd2333b6e0be7a39605f0e255892fd7418a682d8da8fe042fe25128794d2ed" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - [[package]] name = "winapi" version = "0.3.9" diff --git a/Cargo.toml b/Cargo.toml index f0ec03d..3f69ca8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,22 +22,16 @@ timings = ["multi-threaded"] rayon = { version = "1.8", optional = true } arbitrary-chunks = "0.4.1" partition = "0.1.2" -bumpalo = { version = "3.14.0", features = ["collections"] } [dev-dependencies] rayon = "1.8" -criterion = "0.5.1" block-pseudorand = "0.1.2" dhat = "0.3.2" -[target.'cfg(all(not(target_env = "msvc"), tuning))'.dependencies] -tikv-jemallocator = "0.5.4" - # Workaround for reducing compile time when not tuning or benchmarking # Suggestions for a better alternative very welcome... [target.'cfg(any(bench, tuning))'.dependencies] -voracious_radix_sort = { version = "1.2", features = ["voracious_multithread"] } -criterion = "0.5.1" +criterion = { version = "0.5.1", default-features=false, features = ["rayon", "cargo_bench_support"] } block-pseudorand = "0.1.2" [profile.release] @@ -47,16 +41,19 @@ opt-level = 3 [[bench]] name = "basic_sort" harness = false +bench = false required-features = ["multi-threaded"] [[bench]] name = "full_sort" harness = false +bench = false required-features = ["multi-threaded"] [[bench]] name = "struct_sort" harness = false +bench = false required-features = ["multi-threaded"] [[bin]] diff --git a/benches/full_sort.rs b/benches/full_sort.rs index 806efae..c64e426 100644 --- a/benches/full_sort.rs +++ b/benches/full_sort.rs @@ -1,12 +1,14 @@ +#[cfg(not(bench))] +compile_error!("This binary must be run with `RUSTFLAGS='--cfg bench'`"); + use criterion::{black_box, criterion_group, criterion_main, Criterion}; use rdst::utils::bench_utils::{bench_common, bench_medley}; use rdst::utils::test_utils::NumericTest; use rdst::RadixSort; -use voracious_radix_sort::{RadixKey as VorKey, RadixSort as Vor, Radixable}; fn full_sort_common(c: &mut Criterion, shift: T, name_suffix: &str) where - T: NumericTest + Radixable + VorKey, + T: NumericTest, { let tests: Vec<(&str, Box)>)> = vec![ ( @@ -23,13 +25,6 @@ where black_box(input); }), ), - ( - "voracious", - Box::new(|mut input| { - input.voracious_mt_sort(std::thread::available_parallelism().unwrap().get()); - black_box(input); - }), - ), ]; bench_common(c, shift, &("full_sort_".to_owned() + name_suffix), tests); @@ -37,7 +32,7 @@ where fn full_sort_medley_set(c: &mut Criterion, suffix: &str, shift: T) where - T: NumericTest + Radixable + VorKey, + T: NumericTest, { let tests: Vec<(&str, Box)>)> = vec![ ( @@ -54,13 +49,6 @@ where black_box(input); }), ), - ( - "voracious", - Box::new(|mut input| { - input.voracious_mt_sort(std::thread::available_parallelism().unwrap().get()); - black_box(input); - }), - ), ]; bench_medley(c, &("full_sort_medley_".to_owned() + suffix), tests, shift); diff --git a/benches/struct_sort.rs b/benches/struct_sort.rs index 276ed72..a96f67a 100644 --- a/benches/struct_sort.rs +++ b/benches/struct_sort.rs @@ -6,7 +6,6 @@ use criterion::{ use rdst::{RadixKey, RadixSort}; use std::cmp::Ordering; use std::time::Duration; -use voracious_radix_sort::{RadixSort as Vor, Radixable}; #[derive(Debug, Clone, Copy)] pub struct LargeStruct { @@ -40,15 +39,6 @@ impl PartialEq for LargeStruct { } } -impl Radixable for LargeStruct { - type Key = f32; - - #[inline] - fn key(&self) -> Self::Key { - self.sort_key - } -} - fn gen_input_t2d(n: usize) -> Vec { let mut data: Vec = block_rand((n / 10) * 9); data.radix_sort_unstable(); @@ -101,13 +91,6 @@ fn full_sort_struct(c: &mut Criterion) { black_box(input); }), ), - ( - "voracious", - Box::new(|mut input| { - input.voracious_sort(); - black_box(input); - }), - ), ( "sort", Box::new(|mut input| { diff --git a/src/cmd/timings.rs b/src/cmd/timings.rs index e5bb042..8b8c46c 100644 --- a/src/cmd/timings.rs +++ b/src/cmd/timings.rs @@ -24,12 +24,6 @@ compile_error!("This binary must be run with `RUSTFLAGS='--cfg tuning --cfg benc use rdst::utils::bench_utils::gen_bench_exponential_input_set; use rdst::{RadixKey, RadixSort}; use std::time::Instant; -#[cfg(all(tuning, not(target_env = "msvc")))] -use tikv_jemallocator::Jemalloc; - -#[cfg(all(tuning, not(target_env = "msvc")))] -#[global_allocator] -static ALLOC: Jemalloc = Jemalloc; fn print_row(data: Vec) { let mut first = true; diff --git a/src/counts.rs b/src/counts.rs index 23c2290..8ab7db2 100644 --- a/src/counts.rs +++ b/src/counts.rs @@ -1,9 +1,9 @@ use std::cell::RefCell; use std::ops::{Index, IndexMut}; +use std::ptr::copy_nonoverlapping; use crate::RadixKey; -use bumpalo::Bump; use std::rc::Rc; use std::slice::{Iter, SliceIndex}; @@ -65,7 +65,7 @@ pub struct CountMeta { struct ThreadContext { pub counter: RefCell, pub counts: RefCell>>>, - pub bump: Bump, + pub tmp: RefCell>, } impl CountManager { @@ -153,28 +153,29 @@ impl CountManager { T: Copy, F: FnMut(&CountManager, &mut [T], &mut [T]), { - let len = src_bucket.len(); Self::THREAD_CTX.with(|ct| { - let mut tmp = bumpalo::collections::Vec::with_capacity_in(len, &ct.bump); - - // Safety: Vec has the same capacity as the input size, and set_len is not called to set - // the full vec len until after all data has been initialized. Source data is Copy - // so a full copy of the source data is sufficient for initialization. - - // Note: This is done rather than something like extend because the performance - // is significantly better. Extend and co. use push and therefore increment len for each item - // where this simply copies all the data first then sets len once. - - // Existing data is used rather than just leaving it uninitialized until write because - // doing so is undefined behavior and MaybeUninit creates a bunch of code duplication and bloat. - // There's only a minor performance hit for using the copy instead of overwriting uninitialized data. - unsafe { - std::ptr::copy_nonoverlapping(src_bucket.as_ptr(), tmp.as_mut_ptr(), len); - tmp.set_len(len); + let byte_len = std::mem::size_of_val(src_bucket); + let mut t = ct.tmp.borrow_mut(); + + if t.len() < byte_len { + *t = Vec::with_capacity(byte_len); } - f(self, src_bucket, &mut tmp); - drop(tmp); - }) + + // Safety: The buffer is guaranteed to have enough capacity by the logic above. + // As the data is copied from the source buffer to the temporary buffer, and + // T is Copy, the data is therefore correctly initialized (assuming the source itself is). + // Len is set to 0 until the end to ensure that the compiler doesn't assume the buffer + // is fully initialized before that point. + let tmp = unsafe { + t.set_len(0); + let ptr = t.as_mut_ptr() as *mut T; + copy_nonoverlapping(src_bucket.as_ptr(), ptr, src_bucket.len()); + t.set_len(byte_len); + std::slice::from_raw_parts_mut(ptr, src_bucket.len()) + }; + + f(self, src_bucket, tmp); + }); } } diff --git a/src/radix_key_impl.rs b/src/radix_key_impl.rs index d2fa29a..c0f2d89 100644 --- a/src/radix_key_impl.rs +++ b/src/radix_key_impl.rs @@ -19,7 +19,7 @@ impl RadixKey for u16 { if cfg!(target_endian = "little") { unsafe { (self as *const Self as *const u8) - .wrapping_offset(level as isize) + .wrapping_add(level) .read() } } else { @@ -38,7 +38,7 @@ impl RadixKey for u32 { if cfg!(target_endian = "little") { unsafe { (self as *const Self as *const u8) - .wrapping_offset(level as isize) + .wrapping_add(level) .read() } } else { @@ -57,7 +57,7 @@ impl RadixKey for u64 { if cfg!(target_endian = "little") { unsafe { (self as *const Self as *const u8) - .wrapping_offset(level as isize) + .wrapping_add(level) .read() } } else { @@ -76,7 +76,7 @@ impl RadixKey for u128 { if cfg!(target_endian = "little") { unsafe { (self as *const Self as *const u8) - .wrapping_offset(level as isize) + .wrapping_add(level) .read() } } else { @@ -96,7 +96,7 @@ impl RadixKey for usize { if cfg!(target_endian = "little") { unsafe { (self as *const Self as *const u8) - .wrapping_offset(level as isize) + .wrapping_add(level) .read() } } else { @@ -116,7 +116,7 @@ impl RadixKey for usize { if cfg!(target_endian = "little") { unsafe { (self as *const Self as *const u8) - .wrapping_offset(level as isize) + .wrapping_add(level) .read() } } else { @@ -136,7 +136,7 @@ impl RadixKey for usize { if cfg!(target_endian = "little") { unsafe { (self as *const Self as *const u8) - .wrapping_offset(level as isize) + .wrapping_add(level) .read() } } else { diff --git a/src/sorts/scanning_sort.rs b/src/sorts/scanning_sort.rs index 6c3077c..31ea3a2 100644 --- a/src/sorts/scanning_sort.rs +++ b/src/sorts/scanning_sort.rs @@ -99,7 +99,7 @@ fn scanner_thread( ) where T: RadixKey + Copy, { - let mut stash: Vec> = vec![Vec::with_capacity(128); 256]; + let mut stash: Vec> = vec![Vec::new(); 256]; let mut finished_count = 0; let mut finished_map = [false; 256]; diff --git a/src/utils/bench_utils.rs b/src/utils/bench_utils.rs index 2ebf359..f04d057 100644 --- a/src/utils/bench_utils.rs +++ b/src/utils/bench_utils.rs @@ -6,25 +6,15 @@ pub fn gen_bench_input_set(shift: T) -> Vec> where T: NumericTest, { - let n = 200_000_000; + let n = 50_000_000; let half = n / 2; let inputs = gen_inputs(n, shift); // Middle values are used for the case where shift is provided let mut out = vec![ inputs[(half - 2_500)..(half + 2_500)].to_vec(), - inputs[(half - 5_000)..(half + 5_000)].to_vec(), inputs[(half - 25_000)..(half + 25_000)].to_vec(), - inputs[(half - 50_000)..(half + 50_000)].to_vec(), - inputs[(half - 100_000)..(half + 100_000)].to_vec(), - inputs[(half - 150_000)..(half + 150_000)].to_vec(), inputs[(half - 250_000)..(half + 250_000)].to_vec(), - inputs[(half - 500_000)..(half + 500_000)].to_vec(), - inputs[(half - 1_000_000)..(half + 1_000_000)].to_vec(), - inputs[(half - 2_500_000)..(half + 2_500_000)].to_vec(), - inputs[(half - 5_000_000)..(half + 5_000_000)].to_vec(), - inputs[(half - 25_000_000)..(half + 25_000_000)].to_vec(), - inputs[(half - 50_000_000)..(half + 50_000_000)].to_vec(), inputs, ]; From d2d2696154b2788038e6c8bc074aeab51548dc16 Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Mon, 5 Feb 2024 17:11:10 +0900 Subject: [PATCH 08/24] Optimize count_into --- Cargo.lock | 8 ++-- Cargo.toml | 2 +- src/cmd/profiling.rs | 8 +++- src/counts.rs | 94 ++++++++++++++++++---------------------- src/sorts/mt_lsb_sort.rs | 6 +-- src/utils/sort_utils.rs | 2 +- 6 files changed, 58 insertions(+), 62 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 23f999c..a9f1d1f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -237,9 +237,9 @@ checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" [[package]] name = "dhat" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f2aaf837aaf456f6706cb46386ba8dffd4013a757e36f4ea05c20dd46b209a3" +checksum = "98cd11d84628e233de0ce467de10b8633f4ddaecafadefc86e13b84b8739b827" dependencies = [ "backtrace", "lazy_static", @@ -351,9 +351,9 @@ checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "miniz_oxide" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" dependencies = [ "adler", ] diff --git a/Cargo.toml b/Cargo.toml index 3f69ca8..2bac6d3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,7 @@ partition = "0.1.2" [dev-dependencies] rayon = "1.8" block-pseudorand = "0.1.2" -dhat = "0.3.2" +dhat = "0.3.3" # Workaround for reducing compile time when not tuning or benchmarking # Suggestions for a better alternative very welcome... diff --git a/src/cmd/profiling.rs b/src/cmd/profiling.rs index de0f3db..d0e908c 100644 --- a/src/cmd/profiling.rs +++ b/src/cmd/profiling.rs @@ -41,12 +41,16 @@ fn main() { // sorting algorithm, depending on the profiler. This makes it more obvious. sleep(Duration::from_millis(300)); - inputs.radix_sort_builder().with_tuner(&MyTuner {}).sort(); + inputs.radix_sort_builder() + .with_tuner(&MyTuner {}) + .sort(); // A second run, for comparison sleep(Duration::from_millis(300)); let time = Instant::now(); - inputs_2.radix_sort_builder().with_tuner(&MyTuner {}).sort(); + inputs_2.radix_sort_builder() + .with_tuner(&MyTuner {}) + .sort(); let e = time.elapsed().as_millis(); println!("Elapsed: {}ms", e); diff --git a/src/counts.rs b/src/counts.rs index 8ab7db2..2bfb66a 100644 --- a/src/counts.rs +++ b/src/counts.rs @@ -73,22 +73,22 @@ impl CountManager { static THREAD_CTX: ThreadContext = Default::default(); } - #[inline(always)] + #[inline(never)] pub fn get_empty_counts(&self) -> Rc> { - if let Some(counts) = Self::THREAD_CTX.with(|ct| ct.counts.borrow_mut().pop()) { - counts - } else { - Default::default() - } + Self::THREAD_CTX + .with(|ct| ct + .counts + .borrow_mut() + .pop() + .unwrap_or(Default::default())) } - #[inline(always)] + #[inline(never)] pub fn return_counts(&self, counts: Rc>) { counts.borrow_mut().clear(); Self::THREAD_CTX.with(|ct| ct.counts.borrow_mut().push(counts)); } - #[inline(always)] pub fn count_into( &self, counts: &mut Counts, @@ -182,7 +182,7 @@ impl CountManager { impl Counter { #[inline(always)] fn clear(&mut self) { - self.0.iter_mut().for_each(|x| *x = 0); + self.0.fill(0) } #[inline(always)] @@ -197,38 +197,34 @@ impl Counter { println!("({}) COUNT", level); self.clear(); - counts.clear(); + meta.already_sorted = true; if bucket.is_empty() { - meta.first = 0; - meta.last = 0; - meta.already_sorted = true; return; } else if bucket.len() == 1 { let b = bucket[0].get_level(level) as usize; - counts[b] += 1; + counts[b] = 1; meta.first = b as u8; meta.last = b as u8; - meta.already_sorted = true; return; } - let mut already_sorted = true; - let first = bucket.first().unwrap().get_level(level); - let last = bucket.last().unwrap().get_level(level); + meta.first = unsafe { bucket.get_unchecked(0).get_level(level) }; + meta.last = unsafe { bucket.get_unchecked(bucket.len() - 1).get_level(level) }; - let mut continue_from = bucket.len(); + let mut continue_from = 0; let mut prev = 0usize; // First, count directly into the output buffer until we find a value that is out of order. - for (i, item) in bucket.iter().enumerate() { + for item in bucket { let b = item.get_level(level) as usize; - counts[b] += 1; + unsafe { *self.0.get_unchecked_mut(b*4) += 1 } + + continue_from += 1; if b < prev { - continue_from = i + 1; - already_sorted = false; + meta.already_sorted = false; break; } @@ -236,52 +232,48 @@ impl Counter { } if continue_from == bucket.len() { - meta.first = first; - meta.last = last; - meta.already_sorted = already_sorted; return; } let chunks = bucket[continue_from..].chunks_exact(4); let rem = chunks.remainder(); - chunks.into_iter().for_each(|chunk| { - let a = chunk[0].get_level(level) as usize; - let b = chunk[1].get_level(level) as usize; - let c = chunk[2].get_level(level) as usize; - let d = chunk[3].get_level(level) as usize; - - self.0[a * 4] += 1; - self.0[1 + b * 4] += 1; - self.0[2 + c * 4] += 1; - self.0[3 + d * 4] += 1; + chunks.for_each(|chunk| unsafe { + let a = chunk.get_unchecked(0).get_level(level) as usize * 4; + let b = chunk.get_unchecked(1).get_level(level) as usize * 4 + 1; + let c = chunk.get_unchecked(2).get_level(level) as usize * 4 + 2; + let d = chunk.get_unchecked(3).get_level(level) as usize * 4 + 4; + + debug_assert!(a < 1024); + debug_assert!(b < 1024); + debug_assert!(c < 1024); + debug_assert!(d < 1024); + + *self.0.get_unchecked_mut(a) += 1; + *self.0.get_unchecked_mut(b) += 1; + *self.0.get_unchecked_mut(c) += 1; + *self.0.get_unchecked_mut(d) += 1; }); - rem.iter().for_each(|v| { - let b = v.get_level(level) as usize; - counts[b] += 1; + rem.into_iter().for_each(|v| unsafe { + let b = v.get_level(level) as usize * 4; + *self.0.get_unchecked_mut(b) += 1; }); for i in 0..256 { - let agg = self.0[i * 4] + self.0[1 + i * 4] + self.0[2 + i * 4] + self.0[3 + i * 4]; - counts[i] += agg; - } + let a = i * 4; - meta.first = first; - meta.last = last; - meta.already_sorted = already_sorted; + unsafe { + *counts.0.get_unchecked_mut(i) = *self.0.get_unchecked(a) + *self.0.get_unchecked(a + 1) + *self.0.get_unchecked(a + 2) + *self.0.get_unchecked(a + 3); + } + } } } impl Counts { #[inline(always)] pub fn clear(&mut self) { - self.0.iter_mut().for_each(|x| *x = 0); - } - - #[inline(always)] - pub fn new() -> Self { - Self::default() + self.0.fill(0); } #[inline] diff --git a/src/sorts/mt_lsb_sort.rs b/src/sorts/mt_lsb_sort.rs index a6526a7..d9f31c5 100644 --- a/src/sorts/mt_lsb_sort.rs +++ b/src/sorts/mt_lsb_sort.rs @@ -151,15 +151,15 @@ impl<'a> Sorter<'a> { return; } - self.cm.with_tmp_buffer(bucket, |_, bucket, tmp_bucket| { + self.cm.with_tmp_buffer(bucket, |cm, bucket, tmp_bucket| { let levels: Vec = (start_level..=end_level).collect(); let mut invert = false; for level in levels { let (tile_counts, already_sorted) = if invert { - get_tile_counts(&self.cm, tmp_bucket, tile_size, level) + get_tile_counts(cm, tmp_bucket, tile_size, level) } else { - get_tile_counts(&self.cm, bucket, tile_size, level) + get_tile_counts(cm, bucket, tile_size, level) }; if already_sorted { diff --git a/src/utils/sort_utils.rs b/src/utils/sort_utils.rs index d6b6795..ad3a2b3 100644 --- a/src/utils/sort_utils.rs +++ b/src/utils/sort_utils.rs @@ -24,7 +24,7 @@ where println!("({}) TILE_COUNT", level); let num_tiles = cdiv(bucket.len(), tile_size); - let mut tiles: Vec = vec![Counts::new(); num_tiles]; + let mut tiles: Vec = vec![Counts::default(); num_tiles]; let mut meta: Vec = vec![CountMeta::default(); num_tiles]; #[cfg(feature = "multi-threaded")] From d8500f04bac5cd5d4b9d1ff344410a9159e8da11 Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Mon, 5 Feb 2024 18:09:55 +0900 Subject: [PATCH 09/24] Fix count_into --- src/counts.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/counts.rs b/src/counts.rs index 2bfb66a..e855006 100644 --- a/src/counts.rs +++ b/src/counts.rs @@ -242,7 +242,7 @@ impl Counter { let a = chunk.get_unchecked(0).get_level(level) as usize * 4; let b = chunk.get_unchecked(1).get_level(level) as usize * 4 + 1; let c = chunk.get_unchecked(2).get_level(level) as usize * 4 + 2; - let d = chunk.get_unchecked(3).get_level(level) as usize * 4 + 4; + let d = chunk.get_unchecked(3).get_level(level) as usize * 4 + 3; debug_assert!(a < 1024); debug_assert!(b < 1024); From 4a5f169b9522bada0c4cc51fba442900c542bdc9 Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Sat, 8 Feb 2025 14:07:50 +0900 Subject: [PATCH 10/24] Simplify flag spaghetti for running tests, benchmarks etc. Previously, it was possible to run tests without compiling criterion and a bunch of transitive dependencies. But it was a pain to deal with. This change increases compile time when running cargo test, but drastically simplifies the configuration. --- Cargo.lock | 247 ++------------------------ Cargo.toml | 31 +--- benches/basic_sort.rs | 6 +- {src/utils => benches}/bench_utils.rs | 57 +++++- benches/full_sort.rs | 7 +- {src/cmd => scripts}/profiling.rs | 74 ++++++-- {src/cmd => scripts}/timings.rs | 106 +++++++++-- src/lib.rs | 13 +- src/radix_sort.rs | 2 +- src/sorts/comparative_sort.rs | 2 +- src/sorts/lsb_sort.rs | 2 +- src/sorts/mt_lsb_sort.rs | 2 +- src/sorts/recombinating_sort.rs | 2 +- src/sorts/regions_sort.rs | 2 +- src/sorts/scanning_sort.rs | 2 +- src/sorts/ska_sort.rs | 2 +- src/{utils => }/test_utils.rs | 34 ++-- src/{utils/sort_utils.rs => utils.rs} | 0 src/utils/mod.rs | 8 - 19 files changed, 266 insertions(+), 333 deletions(-) rename {src/utils => benches}/bench_utils.rs (79%) rename {src/cmd => scripts}/profiling.rs (57%) mode change 100644 => 100755 rename {src/cmd => scripts}/timings.rs (62%) mode change 100644 => 100755 rename src/{utils => }/test_utils.rs (96%) rename src/{utils/sort_utils.rs => utils.rs} (100%) delete mode 100644 src/utils/mod.rs diff --git a/Cargo.lock b/Cargo.lock index a9f1d1f..5ef803e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,21 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 - -[[package]] -name = "addr2line" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" -dependencies = [ - "gimli", -] - -[[package]] -name = "adler" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +version = 4 [[package]] name = "aho-corasick" @@ -50,27 +35,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" -[[package]] -name = "backtrace" -version = "0.3.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" -dependencies = [ - "addr2line", - "cc", - "cfg-if", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", -] - -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - [[package]] name = "bitflags" version = "2.4.2" @@ -93,15 +57,6 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" -[[package]] -name = "cc" -version = "1.0.83" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" -dependencies = [ - "libc", -] - [[package]] name = "cfg-if" version = "1.0.0" @@ -235,22 +190,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" -[[package]] -name = "dhat" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98cd11d84628e233de0ce467de10b8633f4ddaecafadefc86e13b84b8739b827" -dependencies = [ - "backtrace", - "lazy_static", - "mintex", - "parking_lot", - "rustc-hash", - "serde", - "serde_json", - "thousands", -] - [[package]] name = "either" version = "1.9.0" @@ -267,12 +206,6 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "gimli" -version = "0.28.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" - [[package]] name = "half" version = "2.3.1" @@ -315,12 +248,6 @@ version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - [[package]] name = "libc" version = "0.2.153" @@ -333,37 +260,12 @@ version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" -[[package]] -name = "lock_api" -version = "0.4.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" -dependencies = [ - "autocfg", - "scopeguard", -] - [[package]] name = "memchr" version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" -[[package]] -name = "miniz_oxide" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" -dependencies = [ - "adler", -] - -[[package]] -name = "mintex" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9bec4598fddb13cc7b528819e697852653252b760f1228b7642679bf2ff2cd07" - [[package]] name = "nanorand" version = "0.6.1" @@ -379,15 +281,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "object" -version = "0.32.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" -dependencies = [ - "memchr", -] - [[package]] name = "once_cell" version = "1.19.0" @@ -400,29 +293,6 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" -[[package]] -name = "parking_lot" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.9.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-targets 0.48.5", -] - [[package]] name = "partition" version = "0.1.2" @@ -474,20 +344,10 @@ dependencies = [ "arbitrary-chunks", "block-pseudorand", "criterion", - "dhat", "partition", "rayon", ] -[[package]] -name = "redox_syscall" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "regex" version = "1.10.3" @@ -517,25 +377,13 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" -[[package]] -name = "rustc-demangle" -version = "0.1.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" - -[[package]] -name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - [[package]] name = "rustix" version = "0.38.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" dependencies = [ - "bitflags 2.4.2", + "bitflags", "errno", "libc", "linux-raw-sys", @@ -557,12 +405,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - [[package]] name = "serde" version = "1.0.196" @@ -594,12 +436,6 @@ dependencies = [ "serde", ] -[[package]] -name = "smallvec" -version = "1.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" - [[package]] name = "syn" version = "2.0.48" @@ -611,12 +447,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "thousands" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" - [[package]] name = "tinytemplate" version = "1.2.1" @@ -680,22 +510,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.0", -] - -[[package]] -name = "windows-targets" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" -dependencies = [ - "windows_aarch64_gnullvm 0.48.5", - "windows_aarch64_msvc 0.48.5", - "windows_i686_gnu 0.48.5", - "windows_i686_msvc 0.48.5", - "windows_x86_64_gnu 0.48.5", - "windows_x86_64_gnullvm 0.48.5", - "windows_x86_64_msvc 0.48.5", + "windows-targets", ] [[package]] @@ -704,93 +519,51 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" dependencies = [ - "windows_aarch64_gnullvm 0.52.0", - "windows_aarch64_msvc 0.52.0", - "windows_i686_gnu 0.52.0", - "windows_i686_msvc 0.52.0", - "windows_x86_64_gnu 0.52.0", - "windows_x86_64_gnullvm 0.52.0", - "windows_x86_64_msvc 0.52.0", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" - [[package]] name = "windows_aarch64_gnullvm" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" - [[package]] name = "windows_aarch64_msvc" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" -[[package]] -name = "windows_i686_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" - [[package]] name = "windows_i686_gnu" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" -[[package]] -name = "windows_i686_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" - [[package]] name = "windows_i686_msvc" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" - [[package]] name = "windows_x86_64_gnu" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" - [[package]] name = "windows_x86_64_gnullvm" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" - [[package]] name = "windows_x86_64_msvc" version = "0.52.0" diff --git a/Cargo.toml b/Cargo.toml index 2bac6d3..663bbed 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ homepage = "https://github.com/Nessex/rdst" categories = ["algorithms"] keywords = ["radix","sort","rayon","parallel","multithreaded"] documentation = "https://docs.rs/rdst/" +resolver = "2" [features] default = ["multi-threaded"] @@ -24,15 +25,8 @@ arbitrary-chunks = "0.4.1" partition = "0.1.2" [dev-dependencies] -rayon = "1.8" block-pseudorand = "0.1.2" -dhat = "0.3.3" - -# Workaround for reducing compile time when not tuning or benchmarking -# Suggestions for a better alternative very welcome... -[target.'cfg(any(bench, tuning))'.dependencies] criterion = { version = "0.5.1", default-features=false, features = ["rayon", "cargo_bench_support"] } -block-pseudorand = "0.1.2" [profile.release] codegen-units = 1 @@ -41,31 +35,14 @@ opt-level = 3 [[bench]] name = "basic_sort" harness = false -bench = false -required-features = ["multi-threaded"] +test = false [[bench]] name = "full_sort" harness = false -bench = false -required-features = ["multi-threaded"] +test = false [[bench]] name = "struct_sort" harness = false -bench = false -required-features = ["multi-threaded"] - -[[bin]] -# Requires: RUSTFLAGS="--cfg bench --cfg tuning" AND --features profiling -# Suggestions for a better alternative very welcome... -name = "profiling" -path = "src/cmd/profiling.rs" -required-features = ["profiling"] - -[[bin]] -# Requires: RUSTFLAGS="--cfg bench --cfg tuning" AND --features timings -# Suggestions for a better alternative very welcome... -name = "timings" -path = "src/cmd/timings.rs" -required-features = ["timings"] +test = false diff --git a/benches/basic_sort.rs b/benches/basic_sort.rs index 138130e..d6f3858 100644 --- a/benches/basic_sort.rs +++ b/benches/basic_sort.rs @@ -1,6 +1,8 @@ +mod bench_utils; + use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use rdst::utils::bench_utils::bench_single; -use rdst::utils::test_utils::NumericTest; +use bench_utils::bench_single; +use bench_utils::NumericTest; use rdst::RadixSort; fn basic_sort_set(c: &mut Criterion, suffix: &str, shift: T, count: usize) diff --git a/src/utils/bench_utils.rs b/benches/bench_utils.rs similarity index 79% rename from src/utils/bench_utils.rs rename to benches/bench_utils.rs index f04d057..ab806f8 100644 --- a/src/utils/bench_utils.rs +++ b/benches/bench_utils.rs @@ -1,7 +1,58 @@ -use crate::utils::test_utils::{gen_inputs, NumericTest}; +use rayon::prelude::*; +use std::fmt::Debug; +use std::ops::{Shl, ShlAssign, Shr, ShrAssign}; use criterion::{AxisScale, BatchSize, BenchmarkId, Criterion, PlotConfiguration, Throughput}; use std::time::Duration; +use block_pseudorand::block_rand; +use rayon::iter::IntoParallelRefMutIterator; +use rdst::RadixKey; + +pub trait NumericTest: +RadixKey ++ Sized ++ Copy ++ Debug ++ PartialEq ++ Ord ++ Send ++ Sync ++ Shl ++ Shr ++ ShrAssign ++ ShlAssign +{ +} + +impl NumericTest for T where + T: RadixKey + + Sized + + Copy + + Debug + + PartialEq + + Ord + + Send + + Sync + + Shl + + Shr + + ShrAssign + + ShlAssign +{ +} + +#[allow(dead_code)] +pub fn gen_inputs(n: usize, shift: T) -> Vec +where + T: NumericTest, +{ + let mut inputs: Vec = block_rand(n); + + inputs[0..(n / 2)].par_iter_mut().for_each(|v| *v >>= shift); + inputs[(n / 2)..n].par_iter_mut().for_each(|v| *v <<= shift); + + inputs +} +#[allow(dead_code)] pub fn gen_bench_input_set(shift: T) -> Vec> where T: NumericTest, @@ -23,6 +74,7 @@ where out } +#[allow(dead_code)] pub fn gen_bench_exponential_input_set(shift: T) -> Vec> where T: NumericTest, @@ -47,6 +99,7 @@ where out } +#[allow(dead_code)] pub fn bench_common( c: &mut Criterion, shift: T, @@ -77,6 +130,7 @@ pub fn bench_common( group.finish(); } +#[allow(dead_code)] pub fn bench_medley( c: &mut Criterion, group: &str, @@ -111,6 +165,7 @@ pub fn bench_medley( group.finish(); } +#[allow(dead_code)] pub fn bench_single( c: &mut Criterion, group: &str, diff --git a/benches/full_sort.rs b/benches/full_sort.rs index c64e426..4bffa7f 100644 --- a/benches/full_sort.rs +++ b/benches/full_sort.rs @@ -1,9 +1,8 @@ -#[cfg(not(bench))] -compile_error!("This binary must be run with `RUSTFLAGS='--cfg bench'`"); +mod bench_utils; use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use rdst::utils::bench_utils::{bench_common, bench_medley}; -use rdst::utils::test_utils::NumericTest; +use bench_utils::{bench_common, bench_medley}; +use bench_utils::NumericTest; use rdst::RadixSort; fn full_sort_common(c: &mut Criterion, shift: T, name_suffix: &str) diff --git a/src/cmd/profiling.rs b/scripts/profiling.rs old mode 100644 new mode 100755 similarity index 57% rename from src/cmd/profiling.rs rename to scripts/profiling.rs index d0e908c..c2a8d3f --- a/src/cmd/profiling.rs +++ b/scripts/profiling.rs @@ -1,19 +1,71 @@ -/// NOTE: The primary use-case for this example is for running a large sort with cargo-instruments. -/// It must be run with `--features=tuning`. -/// -/// e.g. -/// ``` -/// RUSTFLAGS='--cfg bench --cfg tuning -g -C opt-level=3 -C force-frame-pointers=y -C target-cpu=apple-m1 -C target-feature=+neon' cargo +nightly instruments -t time --bin profiling --features profiling -/// ``` +#!/usr/bin/env -S cargo +nightly -Zscript +--- +[package] +edition = "2024" -#[cfg(not(all(tuning, bench)))] -compile_error!("This binary must be run with `RUSTFLAGS='--cfg tuning --cfg bench'`"); +[dependencies] +block-pseudorand = "0.1.2" +rayon = "1.10" +rdst = { path = "../" } +[profile.dev] +codegen-units = 1 +opt-level = 3 +debug = false +--- + +use rayon::prelude::*; +use std::fmt::Debug; +use std::ops::{Shl, ShlAssign, Shr, ShrAssign}; use rdst::tuner::{Algorithm, Tuner, TuningParams}; -use rdst::utils::test_utils::gen_inputs; -use rdst::RadixSort; +use rdst::{RadixKey, RadixSort}; use std::thread::sleep; use std::time::{Duration, Instant}; +use block_pseudorand::block_rand; + +pub trait NumericTest: +RadixKey ++ Sized ++ Copy ++ Debug ++ PartialEq ++ Ord ++ Send ++ Sync ++ Shl ++ Shr ++ ShrAssign ++ ShlAssign +{ +} + +impl NumericTest for T where + T: RadixKey + + Sized + + Copy + + Debug + + PartialEq + + Ord + + Send + + Sync + + Shl + + Shr + + ShrAssign + + ShlAssign +{ +} + +fn gen_inputs(n: usize, shift: T) -> Vec +where + T: NumericTest, +{ + let mut inputs: Vec = block_rand(n); + + inputs[0..(n / 2)].par_iter_mut().for_each(|v| *v >>= shift); + inputs[(n / 2)..n].par_iter_mut().for_each(|v| *v <<= shift); + + inputs +} struct MyTuner {} diff --git a/src/cmd/timings.rs b/scripts/timings.rs old mode 100644 new mode 100755 similarity index 62% rename from src/cmd/timings.rs rename to scripts/timings.rs index 8b8c46c..ce8e826 --- a/src/cmd/timings.rs +++ b/scripts/timings.rs @@ -1,3 +1,19 @@ +#!/usr/bin/env -S cargo +nightly -Zscript +--- +[package] +edition = "2024" + +[dependencies] +block-pseudorand = "0.1.2" +rayon = "1.10" +rdst = { path = "../" } + +[profile.dev] +codegen-units = 1 +opt-level = 3 +debug = false +--- + //! # timings //! //! This is used to run the sorting algorithm across a medley of inputs and output the results @@ -9,7 +25,7 @@ //! You may need to tweak the command below for your own machine. //! //! ``` -//! RUSTFLAGS='--cfg bench --cfg tuning -C opt-level=3 -C target-cpu=apple-m1 -C target-feature=+neon' cargo +nightly run --bin timings --features timings -- 1234 "Hello world" +//! RUSTFLAGS='-C target-cpu=apple-m1 -C target-feature=+neon' ./timings.rs 1234 "Hello world" //! ``` //! //! - `1234` is where you place the ID for your run. If you are just running a brief test this can be `N/A`, otherwise it should be something like a commit SHA that you can use to find the code for this run again. @@ -18,12 +34,80 @@ #![feature(string_remove_matches)] -#[cfg(not(all(tuning, bench)))] -compile_error!("This binary must be run with `RUSTFLAGS='--cfg tuning --cfg bench'`"); - -use rdst::utils::bench_utils::gen_bench_exponential_input_set; +use rayon::prelude::*; +use std::fmt::Debug; +use std::ops::{Shl, ShlAssign, Shr, ShrAssign}; use rdst::{RadixKey, RadixSort}; use std::time::Instant; +use block_pseudorand::block_rand; + +pub trait NumericTest: +RadixKey ++ Sized ++ Copy ++ Debug ++ PartialEq ++ Ord ++ Send ++ Sync ++ Shl ++ Shr ++ ShrAssign ++ ShlAssign +{ +} + +impl NumericTest for T where + T: RadixKey + + Sized + + Copy + + Debug + + PartialEq + + Ord + + Send + + Sync + + Shl + + Shr + + ShrAssign + + ShlAssign +{ +} + +fn gen_inputs(n: usize, shift: T) -> Vec +where + T: NumericTest, +{ + let mut inputs: Vec = block_rand(n); + + inputs[0..(n / 2)].par_iter_mut().for_each(|v| *v >>= shift); + inputs[(n / 2)..n].par_iter_mut().for_each(|v| *v <<= shift); + + inputs +} + +fn gen_exponential_input_set(shift: T) -> Vec> +where + T: NumericTest, +{ + let n = 200_000_000; + let inputs = gen_inputs(n, shift); + let mut len = inputs.len(); + let mut out = Vec::new(); + + loop { + let start = (inputs.len() - len) / 2; + let end = start + len; + + out.push(inputs[start..end].to_vec()); + + len = len / 2; + if len == 0 { + break; + } + } + + out +} fn print_row(data: Vec) { let mut first = true; @@ -89,22 +173,22 @@ fn main() { assert_eq!(out.len(), 2); let mut headers = vec!["id".to_string(), "description".to_string()]; - let inputs = gen_bench_exponential_input_set(0u32); + let inputs = gen_exponential_input_set(0u32); bench(inputs, "u32", &mut out, &mut headers); - let inputs = gen_bench_exponential_input_set(16u32); + let inputs = gen_exponential_input_set(16u32); bench(inputs, "u32_bimodal", &mut out, &mut headers); - let inputs = gen_bench_exponential_input_set(0u64); + let inputs = gen_exponential_input_set(0u64); bench(inputs, "u64", &mut out, &mut headers); - let inputs = gen_bench_exponential_input_set(32u64); + let inputs = gen_exponential_input_set(32u64); bench(inputs, "u64_bimodal", &mut out, &mut headers); - let inputs = gen_bench_exponential_input_set(0u128); + let inputs = gen_exponential_input_set(0u128); bench(inputs, "u128", &mut out, &mut headers); - let inputs = gen_bench_exponential_input_set(64u128); + let inputs = gen_exponential_input_set(64u128); bench(inputs, "u128_bimodal", &mut out, &mut headers); if print_headers { diff --git a/src/lib.rs b/src/lib.rs index c33cc25..e66fe12 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -176,24 +176,23 @@ mod radix_key; mod radix_key_impl; mod radix_sort_builder; -#[cfg(not(any(test, bench)))] +#[cfg(not(any(test, debug_assertions)))] mod sorts; -#[cfg(any(test, bench))] +#[cfg(any(test, debug_assertions))] pub mod sorts; -#[cfg(not(any(test, bench, tuning)))] mod utils; -#[cfg(any(test, bench, tuning))] -pub mod utils; - +#[cfg(test)] +pub mod test_utils; mod radix_sort; mod sorter; mod tuners; -// Public modules + pub mod counts; pub mod tuner; + // Public exports pub use radix_key::RadixKey; pub use radix_sort::RadixSort; diff --git a/src/radix_sort.rs b/src/radix_sort.rs index 1891cbc..7091d84 100644 --- a/src/radix_sort.rs +++ b/src/radix_sort.rs @@ -47,7 +47,7 @@ where #[cfg(test)] mod tests { use crate::tuner::{Algorithm, Tuner, TuningParams}; - use crate::utils::test_utils::{sort_comparison_suite, NumericTest, SingleAlgoTuner}; + use crate::test_utils::{sort_comparison_suite, NumericTest, SingleAlgoTuner}; use crate::RadixSort; use block_pseudorand::block_rand; use std::cmp::Ordering; diff --git a/src/sorts/comparative_sort.rs b/src/sorts/comparative_sort.rs index 5bc2641..0d47d90 100644 --- a/src/sorts/comparative_sort.rs +++ b/src/sorts/comparative_sort.rs @@ -55,7 +55,7 @@ mod tests { use crate::sorter::Sorter; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; - use crate::utils::test_utils::{ + use crate::test_utils::{ sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, }; use crate::RadixKey; diff --git a/src/sorts/lsb_sort.rs b/src/sorts/lsb_sort.rs index c3a0f0b..c12133b 100644 --- a/src/sorts/lsb_sort.rs +++ b/src/sorts/lsb_sort.rs @@ -195,7 +195,7 @@ mod tests { use crate::sorter::Sorter; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; - use crate::utils::test_utils::{ + use crate::test_utils::{ sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, }; use crate::RadixKey; diff --git a/src/sorts/mt_lsb_sort.rs b/src/sorts/mt_lsb_sort.rs index d9f31c5..6239225 100644 --- a/src/sorts/mt_lsb_sort.rs +++ b/src/sorts/mt_lsb_sort.rs @@ -221,7 +221,7 @@ mod tests { use crate::tuner::Algorithm; use crate::tuners::StandardTuner; use crate::utils::cdiv; - use crate::utils::test_utils::{ + use crate::test_utils::{ sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, }; use crate::RadixKey; diff --git a/src/sorts/recombinating_sort.rs b/src/sorts/recombinating_sort.rs index acb187c..21a7bf4 100644 --- a/src/sorts/recombinating_sort.rs +++ b/src/sorts/recombinating_sort.rs @@ -119,7 +119,7 @@ mod tests { use crate::sorter::Sorter; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; - use crate::utils::test_utils::{ + use crate::test_utils::{ sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, }; use crate::utils::{aggregate_tile_counts, cdiv, get_tile_counts}; diff --git a/src/sorts/regions_sort.rs b/src/sorts/regions_sort.rs index eb216ef..e1401ec 100644 --- a/src/sorts/regions_sort.rs +++ b/src/sorts/regions_sort.rs @@ -313,7 +313,7 @@ mod tests { use crate::sorter::Sorter; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; - use crate::utils::test_utils::{ + use crate::test_utils::{ sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, }; use crate::utils::{aggregate_tile_counts, cdiv, get_tile_counts}; diff --git a/src/sorts/scanning_sort.rs b/src/sorts/scanning_sort.rs index 31ea3a2..c7bd8a6 100644 --- a/src/sorts/scanning_sort.rs +++ b/src/sorts/scanning_sort.rs @@ -277,7 +277,7 @@ mod tests { use crate::sorter::Sorter; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; - use crate::utils::test_utils::{ + use crate::test_utils::{ sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, }; use crate::RadixKey; diff --git a/src/sorts/ska_sort.rs b/src/sorts/ska_sort.rs index df0eea5..fb59df5 100644 --- a/src/sorts/ska_sort.rs +++ b/src/sorts/ska_sort.rs @@ -129,7 +129,7 @@ mod tests { use crate::sorter::Sorter; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; - use crate::utils::test_utils::{ + use crate::test_utils::{ sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, }; use crate::RadixKey; diff --git a/src/utils/test_utils.rs b/src/test_utils.rs similarity index 96% rename from src/utils/test_utils.rs rename to src/test_utils.rs index 39de5aa..f7dc949 100644 --- a/src/utils/test_utils.rs +++ b/src/test_utils.rs @@ -6,7 +6,23 @@ use std::fmt::Debug; use std::ops::{Shl, ShlAssign, Shr, ShrAssign}; pub trait NumericTest: - RadixKey +RadixKey ++ Sized ++ Copy ++ Debug ++ PartialEq ++ Ord ++ Send ++ Sync ++ Shl ++ Shr ++ ShrAssign ++ ShlAssign +{ +} + +impl NumericTest for T where + T: RadixKey + Sized + Copy + Debug @@ -21,22 +37,6 @@ pub trait NumericTest: { } -impl NumericTest for T where - T: RadixKey - + Sized - + Copy - + Debug - + PartialEq - + Ord - + Send - + Sync - + Shl - + Shr - + ShrAssign - + ShlAssign -{ -} - pub struct SingleAlgoTuner { pub(crate) algo: Algorithm, } diff --git a/src/utils/sort_utils.rs b/src/utils.rs similarity index 100% rename from src/utils/sort_utils.rs rename to src/utils.rs diff --git a/src/utils/mod.rs b/src/utils/mod.rs deleted file mode 100644 index 209f379..0000000 --- a/src/utils/mod.rs +++ /dev/null @@ -1,8 +0,0 @@ -#[cfg(all(feature = "multi-threaded", any(test, bench, tuning)))] -pub mod bench_utils; -#[cfg(all(feature = "multi-threaded", any(test, bench, tuning)))] -pub mod test_utils; - -mod sort_utils; - -pub use sort_utils::*; From cd2ff081ece2d8516dbc466f4edb370a2b09cd33 Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Sat, 8 Feb 2025 21:00:30 +0900 Subject: [PATCH 11/24] Bump rayon@1.10.0 --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5ef803e..664fae2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -319,9 +319,9 @@ dependencies = [ [[package]] name = "rayon" -version = "1.8.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa7237101a77a10773db45d62004a272517633fbcc3df19d96455ede1122e051" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" dependencies = [ "either", "rayon-core", diff --git a/Cargo.toml b/Cargo.toml index 663bbed..7bdb851 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ profiling = ["multi-threaded"] timings = ["multi-threaded"] [dependencies] -rayon = { version = "1.8", optional = true } +rayon = { version = "1.10", optional = true } arbitrary-chunks = "0.4.1" partition = "0.1.2" From 923e93c6d90fab4712666f852f2e54d4f9fe503d Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Sat, 8 Feb 2025 21:03:37 +0900 Subject: [PATCH 12/24] Remove internal features profiling and timings --- Cargo.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7bdb851..d95a1e6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,8 +16,6 @@ resolver = "2" default = ["multi-threaded"] multi-threaded = ["rayon"] work_profiles = [] -profiling = ["multi-threaded"] -timings = ["multi-threaded"] [dependencies] rayon = { version = "1.10", optional = true } From 010b30465c726c43d7b87c4c1ea8c0b3b64dfb90 Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Sat, 8 Feb 2025 21:05:38 +0900 Subject: [PATCH 13/24] Tidy up .gitignore --- .gitignore | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 06b4040..7bed0d8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,4 @@ target/ -Cargo.lock -monte-carlo.tsv -experiments.csv .idea +.jj/ fixture/Cargo.lock From ddaacdbb5efa24206ab05293fe80c39f55063907 Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Sat, 8 Feb 2025 21:11:52 +0900 Subject: [PATCH 14/24] Remove unused cfg --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index e66fe12..aae0f58 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -176,9 +176,9 @@ mod radix_key; mod radix_key_impl; mod radix_sort_builder; -#[cfg(not(any(test, debug_assertions)))] +#[cfg(not(test))] mod sorts; -#[cfg(any(test, debug_assertions))] +#[cfg(test)] pub mod sorts; mod utils; From 671faf3680f6bee13d16e8910ef8e8c41dbc281c Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Sat, 8 Feb 2025 21:11:52 +0900 Subject: [PATCH 15/24] Add gh actions for running examples, fixtures and scripts --- .github/workflows/rust.yml | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 28bc94e..7f88b0c 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout sources - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install nightly toolchain with clippy available uses: actions-rs/toolchain@v1 with: @@ -35,6 +35,39 @@ jobs: test: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Run tests run: cargo test --verbose --release + + test-fixture: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Run fixture + working-directory: fixture + run: cargo run --release + + test-profiling: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Run profiling script + run: ./scripts/profiling.rs + + test-timings: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Run timings script + run: ./scripts/timings.rs + + test-examples: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Run simple_usage + run: cargo run --example simple_usage + - name: Run single_threaded + run: cargo run --example single_threaded + - name: Run custom_tuner + run: cargo run --example custom_tuner \ No newline at end of file From b7da6c6dfc368d1b482dfe157a9b006e4e7b8e3d Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Sat, 8 Feb 2025 21:17:00 +0900 Subject: [PATCH 16/24] Run cargo fmt --- benches/basic_sort.rs | 2 +- benches/bench_utils.rs | 42 ++++++++++++++++----------------- benches/full_sort.rs | 4 ++-- src/counts.rs | 14 +++++------ src/lib.rs | 8 +++---- src/radix_sort.rs | 2 +- src/sorts/comparative_sort.rs | 4 ++-- src/sorts/lsb_sort.rs | 4 ++-- src/sorts/mt_lsb_sort.rs | 6 ++--- src/sorts/recombinating_sort.rs | 4 ++-- src/sorts/regions_sort.rs | 4 ++-- src/sorts/scanning_sort.rs | 4 ++-- src/sorts/ska_sort.rs | 4 ++-- src/test_utils.rs | 34 +++++++++++++------------- 14 files changed, 66 insertions(+), 70 deletions(-) diff --git a/benches/basic_sort.rs b/benches/basic_sort.rs index d6f3858..c7f7386 100644 --- a/benches/basic_sort.rs +++ b/benches/basic_sort.rs @@ -1,8 +1,8 @@ mod bench_utils; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; use bench_utils::bench_single; use bench_utils::NumericTest; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; use rdst::RadixSort; fn basic_sort_set(c: &mut Criterion, suffix: &str, shift: T, count: usize) diff --git a/benches/bench_utils.rs b/benches/bench_utils.rs index ab806f8..c3c0357 100644 --- a/benches/bench_utils.rs +++ b/benches/bench_utils.rs @@ -1,30 +1,14 @@ +use block_pseudorand::block_rand; +use criterion::{AxisScale, BatchSize, BenchmarkId, Criterion, PlotConfiguration, Throughput}; +use rayon::iter::IntoParallelRefMutIterator; use rayon::prelude::*; +use rdst::RadixKey; use std::fmt::Debug; use std::ops::{Shl, ShlAssign, Shr, ShrAssign}; -use criterion::{AxisScale, BatchSize, BenchmarkId, Criterion, PlotConfiguration, Throughput}; use std::time::Duration; -use block_pseudorand::block_rand; -use rayon::iter::IntoParallelRefMutIterator; -use rdst::RadixKey; pub trait NumericTest: -RadixKey -+ Sized -+ Copy -+ Debug -+ PartialEq -+ Ord -+ Send -+ Sync -+ Shl -+ Shr -+ ShrAssign -+ ShlAssign -{ -} - -impl NumericTest for T where - T: RadixKey + RadixKey + Sized + Copy + Debug @@ -39,6 +23,22 @@ impl NumericTest for T where { } +impl NumericTest for T where + T: RadixKey + + Sized + + Copy + + Debug + + PartialEq + + Ord + + Send + + Sync + + Shl + + Shr + + ShrAssign + + ShlAssign +{ +} + #[allow(dead_code)] pub fn gen_inputs(n: usize, shift: T) -> Vec where diff --git a/benches/full_sort.rs b/benches/full_sort.rs index 4bffa7f..33b6d64 100644 --- a/benches/full_sort.rs +++ b/benches/full_sort.rs @@ -1,8 +1,8 @@ mod bench_utils; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use bench_utils::{bench_common, bench_medley}; use bench_utils::NumericTest; +use bench_utils::{bench_common, bench_medley}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; use rdst::RadixSort; fn full_sort_common(c: &mut Criterion, shift: T, name_suffix: &str) diff --git a/src/counts.rs b/src/counts.rs index e855006..dd05285 100644 --- a/src/counts.rs +++ b/src/counts.rs @@ -75,12 +75,7 @@ impl CountManager { #[inline(never)] pub fn get_empty_counts(&self) -> Rc> { - Self::THREAD_CTX - .with(|ct| ct - .counts - .borrow_mut() - .pop() - .unwrap_or(Default::default())) + Self::THREAD_CTX.with(|ct| ct.counts.borrow_mut().pop().unwrap_or(Default::default())) } #[inline(never)] @@ -219,7 +214,7 @@ impl Counter { // First, count directly into the output buffer until we find a value that is out of order. for item in bucket { let b = item.get_level(level) as usize; - unsafe { *self.0.get_unchecked_mut(b*4) += 1 } + unsafe { *self.0.get_unchecked_mut(b * 4) += 1 } continue_from += 1; @@ -264,7 +259,10 @@ impl Counter { let a = i * 4; unsafe { - *counts.0.get_unchecked_mut(i) = *self.0.get_unchecked(a) + *self.0.get_unchecked(a + 1) + *self.0.get_unchecked(a + 2) + *self.0.get_unchecked(a + 3); + *counts.0.get_unchecked_mut(i) = *self.0.get_unchecked(a) + + *self.0.get_unchecked(a + 1) + + *self.0.get_unchecked(a + 2) + + *self.0.get_unchecked(a + 3); } } } diff --git a/src/lib.rs b/src/lib.rs index aae0f58..b056a99 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -181,18 +181,16 @@ mod sorts; #[cfg(test)] pub mod sorts; -mod utils; -#[cfg(test)] -pub mod test_utils; mod radix_sort; mod sorter; +#[cfg(test)] +pub mod test_utils; mod tuners; - +mod utils; pub mod counts; pub mod tuner; - // Public exports pub use radix_key::RadixKey; pub use radix_sort::RadixSort; diff --git a/src/radix_sort.rs b/src/radix_sort.rs index 7091d84..e9f6ea4 100644 --- a/src/radix_sort.rs +++ b/src/radix_sort.rs @@ -46,8 +46,8 @@ where #[cfg(test)] mod tests { - use crate::tuner::{Algorithm, Tuner, TuningParams}; use crate::test_utils::{sort_comparison_suite, NumericTest, SingleAlgoTuner}; + use crate::tuner::{Algorithm, Tuner, TuningParams}; use crate::RadixSort; use block_pseudorand::block_rand; use std::cmp::Ordering; diff --git a/src/sorts/comparative_sort.rs b/src/sorts/comparative_sort.rs index 0d47d90..039e890 100644 --- a/src/sorts/comparative_sort.rs +++ b/src/sorts/comparative_sort.rs @@ -53,11 +53,11 @@ impl<'a> Sorter<'a> { #[cfg(test)] mod tests { use crate::sorter::Sorter; - use crate::tuner::Algorithm; - use crate::tuners::StandardTuner; use crate::test_utils::{ sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, }; + use crate::tuner::Algorithm; + use crate::tuners::StandardTuner; use crate::RadixKey; fn test_comparative_sort_adapter(shift: T) diff --git a/src/sorts/lsb_sort.rs b/src/sorts/lsb_sort.rs index c12133b..4ee053b 100644 --- a/src/sorts/lsb_sort.rs +++ b/src/sorts/lsb_sort.rs @@ -193,11 +193,11 @@ impl<'a> Sorter<'a> { #[cfg(test)] mod tests { use crate::sorter::Sorter; - use crate::tuner::Algorithm; - use crate::tuners::StandardTuner; use crate::test_utils::{ sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, }; + use crate::tuner::Algorithm; + use crate::tuners::StandardTuner; use crate::RadixKey; fn test_lsb_sort_adapter(shift: T) diff --git a/src/sorts/mt_lsb_sort.rs b/src/sorts/mt_lsb_sort.rs index 6239225..1db8d41 100644 --- a/src/sorts/mt_lsb_sort.rs +++ b/src/sorts/mt_lsb_sort.rs @@ -218,12 +218,12 @@ impl<'a> Sorter<'a> { #[cfg(test)] mod tests { use crate::sorter::Sorter; - use crate::tuner::Algorithm; - use crate::tuners::StandardTuner; - use crate::utils::cdiv; use crate::test_utils::{ sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, }; + use crate::tuner::Algorithm; + use crate::tuners::StandardTuner; + use crate::utils::cdiv; use crate::RadixKey; use rayon::current_num_threads; diff --git a/src/sorts/recombinating_sort.rs b/src/sorts/recombinating_sort.rs index 21a7bf4..312971c 100644 --- a/src/sorts/recombinating_sort.rs +++ b/src/sorts/recombinating_sort.rs @@ -117,11 +117,11 @@ mod tests { use crate::counts::CountManager; use crate::sorter::Sorter; - use crate::tuner::Algorithm; - use crate::tuners::StandardTuner; use crate::test_utils::{ sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, }; + use crate::tuner::Algorithm; + use crate::tuners::StandardTuner; use crate::utils::{aggregate_tile_counts, cdiv, get_tile_counts}; use crate::RadixKey; use rayon::current_num_threads; diff --git a/src/sorts/regions_sort.rs b/src/sorts/regions_sort.rs index e1401ec..687def5 100644 --- a/src/sorts/regions_sort.rs +++ b/src/sorts/regions_sort.rs @@ -311,11 +311,11 @@ impl<'a> Sorter<'a> { mod tests { use crate::counts::CountManager; use crate::sorter::Sorter; - use crate::tuner::Algorithm; - use crate::tuners::StandardTuner; use crate::test_utils::{ sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, }; + use crate::tuner::Algorithm; + use crate::tuners::StandardTuner; use crate::utils::{aggregate_tile_counts, cdiv, get_tile_counts}; use crate::RadixKey; use rayon::current_num_threads; diff --git a/src/sorts/scanning_sort.rs b/src/sorts/scanning_sort.rs index c7bd8a6..032fff8 100644 --- a/src/sorts/scanning_sort.rs +++ b/src/sorts/scanning_sort.rs @@ -275,11 +275,11 @@ impl<'a> Sorter<'a> { mod tests { use crate::counts::CountManager; use crate::sorter::Sorter; - use crate::tuner::Algorithm; - use crate::tuners::StandardTuner; use crate::test_utils::{ sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, }; + use crate::tuner::Algorithm; + use crate::tuners::StandardTuner; use crate::RadixKey; fn test_scanning_sort(shift: T) diff --git a/src/sorts/ska_sort.rs b/src/sorts/ska_sort.rs index fb59df5..ae1fd46 100644 --- a/src/sorts/ska_sort.rs +++ b/src/sorts/ska_sort.rs @@ -127,11 +127,11 @@ impl<'a> Sorter<'a> { #[cfg(test)] mod tests { use crate::sorter::Sorter; - use crate::tuner::Algorithm; - use crate::tuners::StandardTuner; use crate::test_utils::{ sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, }; + use crate::tuner::Algorithm; + use crate::tuners::StandardTuner; use crate::RadixKey; fn test_ska_sort_adapter(shift: T) diff --git a/src/test_utils.rs b/src/test_utils.rs index f7dc949..39de5aa 100644 --- a/src/test_utils.rs +++ b/src/test_utils.rs @@ -6,23 +6,7 @@ use std::fmt::Debug; use std::ops::{Shl, ShlAssign, Shr, ShrAssign}; pub trait NumericTest: -RadixKey -+ Sized -+ Copy -+ Debug -+ PartialEq -+ Ord -+ Send -+ Sync -+ Shl -+ Shr -+ ShrAssign -+ ShlAssign -{ -} - -impl NumericTest for T where - T: RadixKey + RadixKey + Sized + Copy + Debug @@ -37,6 +21,22 @@ impl NumericTest for T where { } +impl NumericTest for T where + T: RadixKey + + Sized + + Copy + + Debug + + PartialEq + + Ord + + Send + + Sync + + Shl + + Shr + + ShrAssign + + ShlAssign +{ +} + pub struct SingleAlgoTuner { pub(crate) algo: Algorithm, } From 22e7fe914d0440ad01a529ab08f84f39b83d660a Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Sat, 8 Feb 2025 21:21:40 +0900 Subject: [PATCH 17/24] Apply clippy suggestions --- src/counts.rs | 4 ++-- src/sorter.rs | 4 ++-- src/sorts/mt_lsb_sort.rs | 5 ++--- src/sorts/recombinating_sort.rs | 6 +++--- src/sorts/regions_sort.rs | 24 ++++++++++++------------ src/sorts/scanning_sort.rs | 10 +++++----- src/utils.rs | 7 +------ 7 files changed, 27 insertions(+), 33 deletions(-) diff --git a/src/counts.rs b/src/counts.rs index dd05285..b15990b 100644 --- a/src/counts.rs +++ b/src/counts.rs @@ -75,7 +75,7 @@ impl CountManager { #[inline(never)] pub fn get_empty_counts(&self) -> Rc> { - Self::THREAD_CTX.with(|ct| ct.counts.borrow_mut().pop().unwrap_or(Default::default())) + Self::THREAD_CTX.with(|ct| ct.counts.borrow_mut().pop().unwrap_or_default()) } #[inline(never)] @@ -250,7 +250,7 @@ impl Counter { *self.0.get_unchecked_mut(d) += 1; }); - rem.into_iter().for_each(|v| unsafe { + rem.iter().for_each(|v| unsafe { let b = v.get_level(level) as usize * 4; *self.0.get_unchecked_mut(b) += 1; }); diff --git a/src/sorter.rs b/src/sorter.rs index 78671be..5a17e7a 100644 --- a/src/sorter.rs +++ b/src/sorter.rs @@ -107,7 +107,7 @@ impl<'a> Sorter<'a> { let use_tiles = cfg!(feature = "multi-threaded") && self.multi_threaded && chunk.len() >= 260_000; let tile_size = if use_tiles { - max(30_000, cdiv(chunk.len(), threads)) + max(30_000, chunk.len().div_ceil(threads)) } else { chunk.len() }; @@ -194,7 +194,7 @@ impl<'a> Sorter<'a> { let parent_len = Some(bucket.len()); let threads = current_num_threads(); - let segment_size = cdiv(bucket.len(), threads); + let segment_size = bucket.len().div_ceil(threads); let mut running_total = 0; let mut radix_start = 255; diff --git a/src/sorts/mt_lsb_sort.rs b/src/sorts/mt_lsb_sort.rs index 1db8d41..4c976c9 100644 --- a/src/sorts/mt_lsb_sort.rs +++ b/src/sorts/mt_lsb_sort.rs @@ -223,7 +223,6 @@ mod tests { }; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; - use crate::utils::cdiv; use crate::RadixKey; use rayon::current_num_threads; @@ -237,7 +236,7 @@ mod tests { } let sorter = Sorter::new(true, &StandardTuner); - let tile_size = cdiv(inputs.len(), current_num_threads()); + let tile_size = inputs.len().div_ceil(current_num_threads()); sorter.mt_lsb_sort_adapter(inputs, 0, T::LEVELS - 1, tile_size); }); @@ -293,7 +292,7 @@ mod tests { } let sorter = Sorter::new(true, &StandardTuner); - let tile_size = cdiv(inputs.len(), current_num_threads()); + let tile_size = inputs.len().div_ceil(current_num_threads()); sorter.mt_lsb_sort_adapter(inputs, 0, u32::LEVELS - 1, tile_size); }); diff --git a/src/sorts/recombinating_sort.rs b/src/sorts/recombinating_sort.rs index 312971c..b7936cf 100644 --- a/src/sorts/recombinating_sort.rs +++ b/src/sorts/recombinating_sort.rs @@ -122,7 +122,7 @@ mod tests { }; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; - use crate::utils::{aggregate_tile_counts, cdiv, get_tile_counts}; + use crate::utils::{aggregate_tile_counts, get_tile_counts}; use crate::RadixKey; use rayon::current_num_threads; @@ -132,7 +132,7 @@ mod tests { { sort_comparison_suite(shift, |inputs| { let level = T::LEVELS - 1; - let tile_size = cdiv(inputs.len(), current_num_threads()); + let tile_size = inputs.len().div_ceil(current_num_threads()); if inputs.len() == 0 { return; @@ -187,7 +187,7 @@ mod tests { pub fn test_u32_patterns() { validate_u32_patterns(|inputs| { let level = u32::LEVELS - 1; - let tile_size = cdiv(inputs.len(), current_num_threads()); + let tile_size = inputs.len().div_ceil(current_num_threads()); if inputs.len() == 0 { return; diff --git a/src/sorts/regions_sort.rs b/src/sorts/regions_sort.rs index 687def5..3b7aeea 100644 --- a/src/sorts/regions_sort.rs +++ b/src/sorts/regions_sort.rs @@ -10,15 +10,15 @@ //! 2. Compute counts for each bucket and sort each bucket in-place //! 3. Generate global counts //! 4. Generate Graph & Sort -//! 4.1 List outbound regions for each country -//! 4.2 For each country (C): -//! 4.2.1: List the inbounds for C (filter outbounds for each other country by destination: C) -//! 4.2.2: For each thread: -//! 4.2.2.1: Pop an item off the inbound (country: I) & outbound (country: O) queues for C -//! 4.2.2.2/a: If they are the same size, continue -//! 4.2.2.2/b: If I is bigger than O, keep the remainder of I in the queue and continue -//! 4.2.2.2/c: If O is bigger than I, keep the remainder of O in the queue and continue -//! 4.2.2.3: Swap items in C heading to O, with items in I destined for C (items in C may or may not be destined for O ultimately) +//! 4.1 List outbound regions for each country +//! 4.2 For each country (C): +//! 4.2.1: List the inbounds for C (filter outbounds for each other country by destination: C) +//! 4.2.2: For each thread: +//! 4.2.2.1: Pop an item off the inbound (country: I) & outbound (country: O) queues for C +//! 4.2.2.2/a: If they are the same size, continue +//! 4.2.2.2/b: If I is bigger than O, keep the remainder of I in the queue and continue +//! 4.2.2.2/c: If O is bigger than I, keep the remainder of O in the queue and continue +//! 4.2.2.3: Swap items in C heading to O, with items in I destined for C (items in C may or may not be destined for O ultimately) //! //! ## Characteristics //! @@ -316,7 +316,7 @@ mod tests { }; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; - use crate::utils::{aggregate_tile_counts, cdiv, get_tile_counts}; + use crate::utils::{aggregate_tile_counts, get_tile_counts}; use crate::RadixKey; use rayon::current_num_threads; @@ -331,7 +331,7 @@ mod tests { return; } - let tile_size = cdiv(inputs.len(), current_num_threads()); + let tile_size = inputs.len().div_ceil(current_num_threads()); let (tile_counts, _) = get_tile_counts(&cm, inputs, tile_size, T::LEVELS - 1); let counts = aggregate_tile_counts(&cm, &tile_counts); @@ -384,7 +384,7 @@ mod tests { let cm = CountManager::default(); let sorter = Sorter::new(true, &StandardTuner); - let tile_size = cdiv(inputs.len(), current_num_threads()); + let tile_size = inputs.len().div_ceil(current_num_threads()); let (tile_counts, _) = get_tile_counts(&cm, inputs, tile_size, u32::LEVELS - 1); let counts = aggregate_tile_counts(&cm, &tile_counts); diff --git a/src/sorts/scanning_sort.rs b/src/sorts/scanning_sort.rs index 032fff8..38af5ed 100644 --- a/src/sorts/scanning_sort.rs +++ b/src/sorts/scanning_sort.rs @@ -6,11 +6,11 @@ //! 2. Create a worker for each rayon global thread pool thread (roughly, one per core) //! 2. Create a temporary thread-local buffer for each worker (one vec for each radix) //! 3. Each thread: -//! 3.1. Iterates over the buckets, trying to gain a mutex lock on one -//! 3.2. On first lock of the bucket, it partitions the bucket into [correct data | incorrect data] in-place -//! 3.3. Scan over the contents of the bucket, picking up data that shouldn't be there and putting it in the thread-local buffer -//! 3.4. Writes any buffered contents that _should_ be in this bucket, into the bucket -//! 3.5. Repeats 3 until all buckets are completely filled with the correct data +//! 3.1. Iterates over the buckets, trying to gain a mutex lock on one +//! 3.2. On first lock of the bucket, it partitions the bucket into [correct data | incorrect data] in-place +//! 3.3. Scan over the contents of the bucket, picking up data that shouldn't be there and putting it in the thread-local buffer +//! 3.4. Writes any buffered contents that _should_ be in this bucket, into the bucket +//! 3.5. Repeats 3 until all buckets are completely filled with the correct data //! //! Along the way, each output bucket has a read head and a write head, which is a pointer to the latest content read and written respectively. //! When the read head reaches the end of the bucket, there is no more content to be buffered by any worker. diff --git a/src/utils.rs b/src/utils.rs index ad3a2b3..ecfa60d 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -5,11 +5,6 @@ use rayon::prelude::*; use std::cell::RefCell; use std::rc::Rc; -#[inline] -pub const fn cdiv(a: usize, b: usize) -> usize { - (a + b - 1) / b -} - #[inline] pub fn get_tile_counts( cm: &CountManager, @@ -23,7 +18,7 @@ where #[cfg(feature = "work_profiles")] println!("({}) TILE_COUNT", level); - let num_tiles = cdiv(bucket.len(), tile_size); + let num_tiles = bucket.len().div_ceil(tile_size); let mut tiles: Vec = vec![Counts::default(); num_tiles]; let mut meta: Vec = vec![CountMeta::default(); num_tiles]; From 3a8061578e40238184bc2e93da0b59e07d01548d Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Sat, 8 Feb 2025 21:23:24 +0900 Subject: [PATCH 18/24] Add nightly toolchain for script tests in CI --- .github/workflows/rust.yml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7f88b0c..98c13e4 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -51,6 +51,12 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + - name: Install nightly toolchain + uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: nightly + override: true - name: Run profiling script run: ./scripts/profiling.rs @@ -58,8 +64,14 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + - name: Install nightly toolchain + uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: nightly + override: true - name: Run timings script - run: ./scripts/timings.rs + run: ./scripts/timings.rs 1234 "Hello world" test-examples: runs-on: ubuntu-latest From 04861c2d87f3c95d664af08bf4edfa071f1af9b5 Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Sat, 8 Feb 2025 22:13:29 +0900 Subject: [PATCH 19/24] Return counts even when already sorted --- src/counts.rs | 54 ++++++++++++++++++++++++++++++++- src/sorts/recombinating_sort.rs | 1 - 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/src/counts.rs b/src/counts.rs index b15990b..7afe17b 100644 --- a/src/counts.rs +++ b/src/counts.rs @@ -21,7 +21,7 @@ impl Default for Counter { } #[repr(C, align(2048))] -#[derive(Clone)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct Counts([usize; 256]); pub type PrefixSums = Counts; pub type EndOffsets = Counts; @@ -227,6 +227,9 @@ impl Counter { } if continue_from == bucket.len() { + for i in 0..256 { + counts[i] = unsafe { *self.0.get_unchecked_mut(i * 4) } + } return; } @@ -299,3 +302,52 @@ impl<'a> IntoIterator for &'a Counts { self.0.iter() } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + pub fn test_counting() { + let count_manager = CountManager::default(); + + let data: [u16; 5] = [0x0000, 0x0101, 0x0200, 0x0200, 0xFFFF]; + let counts_lower = count_manager.counts(&data, 0); + let counts_upper = count_manager.counts(&data, 1); + let mut expected_lower = Counts::default(); + let mut expected_upper = Counts::default(); + expected_lower[0] = 3; + expected_lower[1] = 1; + expected_lower[255] = 1; + + expected_upper[0] = 1; + expected_upper[1] = 1; + expected_upper[2] = 2; + expected_upper[255] = 1; + + assert_eq!(counts_lower.0.take(), expected_lower); + assert_eq!(counts_upper.0.take(), expected_upper); + } + + #[test] + pub fn test_reuse() { + let count_manager = CountManager::default(); + + let data_1: [u16; 5] = [0x0000, 0x0101, 0x0200, 0x0200, 0xFFFF]; + let data_2: [u16; 5] = [0x0101, 0x0202, 0x0301, 0x0301, 0x0000]; + let counts_1 = count_manager.counts(&data_1, 0); + let counts_2 = count_manager.counts(&data_2, 0); + let mut expected_1 = Counts::default(); + let mut expected_2 = Counts::default(); + expected_1[0] = 3; + expected_1[1] = 1; + expected_1[255] = 1; + + expected_2[0] = 1; + expected_2[1] = 3; + expected_2[2] = 1; + + assert_eq!(counts_1.0.take(), expected_1); + assert_eq!(counts_2.0.take(), expected_2); + } +} \ No newline at end of file diff --git a/src/sorts/recombinating_sort.rs b/src/sorts/recombinating_sort.rs index b7936cf..a5e6da6 100644 --- a/src/sorts/recombinating_sort.rs +++ b/src/sorts/recombinating_sort.rs @@ -114,7 +114,6 @@ impl<'a> Sorter<'a> { #[cfg(test)] mod tests { - use crate::counts::CountManager; use crate::sorter::Sorter; use crate::test_utils::{ From c5d4ad759f342f8adb42095b8be009c706d123c1 Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Sat, 8 Feb 2025 22:29:08 +0900 Subject: [PATCH 20/24] Use single algo in test suites for each algorithm --- src/counts.rs | 2 +- src/sorts/comparative_sort.rs | 7 +++---- src/sorts/lsb_sort.rs | 11 ++++++----- src/sorts/mt_lsb_sort.rs | 28 ++++++++++++++++++++++++---- src/sorts/recombinating_sort.rs | 8 ++++---- src/sorts/regions_sort.rs | 9 +++++---- src/sorts/scanning_sort.rs | 8 ++++---- src/sorts/ska_sort.rs | 8 ++++---- 8 files changed, 51 insertions(+), 30 deletions(-) diff --git a/src/counts.rs b/src/counts.rs index 7afe17b..97e442d 100644 --- a/src/counts.rs +++ b/src/counts.rs @@ -149,7 +149,7 @@ impl CountManager { F: FnMut(&CountManager, &mut [T], &mut [T]), { Self::THREAD_CTX.with(|ct| { - let byte_len = std::mem::size_of_val(src_bucket); + let byte_len = size_of_val(src_bucket); let mut t = ct.tmp.borrow_mut(); if t.len() < byte_len { diff --git a/src/sorts/comparative_sort.rs b/src/sorts/comparative_sort.rs index 039e890..b58662e 100644 --- a/src/sorts/comparative_sort.rs +++ b/src/sorts/comparative_sort.rs @@ -53,9 +53,7 @@ impl<'a> Sorter<'a> { #[cfg(test)] mod tests { use crate::sorter::Sorter; - use crate::test_utils::{ - sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, - }; + use crate::test_utils::{sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, SingleAlgoTuner}; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; use crate::RadixKey; @@ -64,8 +62,9 @@ mod tests { where T: NumericTest, { + let tuner = SingleAlgoTuner{ algo: Algorithm::Comparative }; sort_comparison_suite(shift, |inputs| { - let sorter = Sorter::new(true, &StandardTuner); + let sorter = Sorter::new(true, &tuner); sorter.comparative_sort(inputs, T::LEVELS - 1); }); } diff --git a/src/sorts/lsb_sort.rs b/src/sorts/lsb_sort.rs index 4ee053b..6ceb552 100644 --- a/src/sorts/lsb_sort.rs +++ b/src/sorts/lsb_sort.rs @@ -193,9 +193,7 @@ impl<'a> Sorter<'a> { #[cfg(test)] mod tests { use crate::sorter::Sorter; - use crate::test_utils::{ - sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, - }; + use crate::test_utils::{sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, SingleAlgoTuner}; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; use crate::RadixKey; @@ -204,15 +202,18 @@ mod tests { where T: NumericTest, { + let tuner = SingleAlgoTuner{ algo: Algorithm::Lsb }; + let tuner_lsb = SingleAlgoTuner{ algo: Algorithm::LrLsb }; + sort_comparison_suite(shift, |inputs| { - let sorter = Sorter::new(true, &StandardTuner); + let sorter = Sorter::new(true, &tuner); let (counts, _) = sorter.cm.counts(inputs, T::LEVELS - 1); sorter.lsb_sort_adapter(false, inputs, counts, 0, T::LEVELS - 1) }); sort_comparison_suite(shift, |inputs| { - let sorter = Sorter::new(true, &StandardTuner); + let sorter = Sorter::new(true, &tuner_lsb); let (counts, _) = sorter.cm.counts(inputs, T::LEVELS - 1); sorter.lsb_sort_adapter(true, inputs, counts, 0, T::LEVELS - 1); diff --git a/src/sorts/mt_lsb_sort.rs b/src/sorts/mt_lsb_sort.rs index 4c976c9..7f117b0 100644 --- a/src/sorts/mt_lsb_sort.rs +++ b/src/sorts/mt_lsb_sort.rs @@ -218,28 +218,48 @@ impl<'a> Sorter<'a> { #[cfg(test)] mod tests { use crate::sorter::Sorter; - use crate::test_utils::{ - sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, - }; + use crate::test_utils::{sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, SingleAlgoTuner}; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; use crate::RadixKey; use rayon::current_num_threads; + use crate::counts::CountManager; + use crate::utils::{aggregate_tile_counts, get_tile_counts}; fn test_mt_lsb_sort_adapter(shift: T) where T: NumericTest, { + let tuner = SingleAlgoTuner{ algo: Algorithm::MtLsb }; + let tuner_oop = SingleAlgoTuner{ algo: Algorithm::MtOop }; + sort_comparison_suite(shift, |inputs| { if inputs.len() == 0 { return; } - let sorter = Sorter::new(true, &StandardTuner); + let sorter = Sorter::new(true, &tuner); let tile_size = inputs.len().div_ceil(current_num_threads()); sorter.mt_lsb_sort_adapter(inputs, 0, T::LEVELS - 1, tile_size); }); + + sort_comparison_suite(shift, |inputs| { + let level = T::LEVELS - 1; + let tile_size = inputs.len().div_ceil(current_num_threads()); + + if inputs.len() == 0 { + return; + } + + let cm = CountManager::default(); + let sorter = Sorter::new(true, &tuner_oop); + + let (tile_counts, _) = get_tile_counts(&cm, inputs, tile_size, level); + let counts = aggregate_tile_counts(&cm, &tile_counts); + + sorter.mt_oop_sort_adapter(inputs, T::LEVELS - 1, counts, tile_counts, tile_size); + }); } #[test] diff --git a/src/sorts/recombinating_sort.rs b/src/sorts/recombinating_sort.rs index a5e6da6..3d037f2 100644 --- a/src/sorts/recombinating_sort.rs +++ b/src/sorts/recombinating_sort.rs @@ -116,9 +116,7 @@ impl<'a> Sorter<'a> { mod tests { use crate::counts::CountManager; use crate::sorter::Sorter; - use crate::test_utils::{ - sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, - }; + use crate::test_utils::{sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, SingleAlgoTuner}; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; use crate::utils::{aggregate_tile_counts, get_tile_counts}; @@ -129,6 +127,8 @@ mod tests { where T: NumericTest, { + let tuner = SingleAlgoTuner{ algo: Algorithm::Recombinating }; + sort_comparison_suite(shift, |inputs| { let level = T::LEVELS - 1; let tile_size = inputs.len().div_ceil(current_num_threads()); @@ -138,7 +138,7 @@ mod tests { } let cm = CountManager::default(); - let sorter = Sorter::new(true, &StandardTuner); + let sorter = Sorter::new(true, &tuner); let (tile_counts, _) = get_tile_counts(&cm, inputs, tile_size, level); let counts = aggregate_tile_counts(&cm, &tile_counts); diff --git a/src/sorts/regions_sort.rs b/src/sorts/regions_sort.rs index 3b7aeea..65ac5b4 100644 --- a/src/sorts/regions_sort.rs +++ b/src/sorts/regions_sort.rs @@ -311,9 +311,7 @@ impl<'a> Sorter<'a> { mod tests { use crate::counts::CountManager; use crate::sorter::Sorter; - use crate::test_utils::{ - sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, - }; + use crate::test_utils::{sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, SingleAlgoTuner}; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; use crate::utils::{aggregate_tile_counts, get_tile_counts}; @@ -324,9 +322,12 @@ mod tests { where T: NumericTest, { + let tuner = SingleAlgoTuner{ algo: Algorithm::Regions }; + sort_comparison_suite(shift, |inputs| { let cm = CountManager::default(); - let sorter = Sorter::new(true, &StandardTuner); + let sorter = Sorter::new(true, &tuner); + if inputs.len() == 0 { return; } diff --git a/src/sorts/scanning_sort.rs b/src/sorts/scanning_sort.rs index 38af5ed..7ec2746 100644 --- a/src/sorts/scanning_sort.rs +++ b/src/sorts/scanning_sort.rs @@ -275,9 +275,7 @@ impl<'a> Sorter<'a> { mod tests { use crate::counts::CountManager; use crate::sorter::Sorter; - use crate::test_utils::{ - sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, - }; + use crate::test_utils::{sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, SingleAlgoTuner}; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; use crate::RadixKey; @@ -286,9 +284,11 @@ mod tests { where T: NumericTest, { + let tuner = SingleAlgoTuner{ algo: Algorithm::Scanning }; + sort_comparison_suite(shift, |inputs| { let cm = CountManager::default(); - let sorter = Sorter::new(true, &StandardTuner); + let sorter = Sorter::new(true, &tuner); let (counts, _) = cm.counts(inputs, T::LEVELS - 1); sorter.scanning_sort_adapter(inputs, counts, T::LEVELS - 1) diff --git a/src/sorts/ska_sort.rs b/src/sorts/ska_sort.rs index ae1fd46..3d75c53 100644 --- a/src/sorts/ska_sort.rs +++ b/src/sorts/ska_sort.rs @@ -127,9 +127,7 @@ impl<'a> Sorter<'a> { #[cfg(test)] mod tests { use crate::sorter::Sorter; - use crate::test_utils::{ - sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, - }; + use crate::test_utils::{sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, SingleAlgoTuner}; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; use crate::RadixKey; @@ -138,8 +136,10 @@ mod tests { where T: NumericTest, { + let tuner = SingleAlgoTuner{ algo: Algorithm::Ska }; + sort_comparison_suite(shift, |inputs| { - let sorter = Sorter::new(true, &StandardTuner); + let sorter = Sorter::new(true, &tuner); let (counts, _) = sorter.cm.counts(inputs, T::LEVELS - 1); sorter.ska_sort_adapter(inputs, counts, T::LEVELS - 1); From ab6e218b2516f21f1f4d4b5e20adcd2effc46cb7 Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Sat, 8 Feb 2025 23:19:43 +0900 Subject: [PATCH 21/24] Add missing level check to mt-oop --- src/sorts/mt_lsb_sort.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/sorts/mt_lsb_sort.rs b/src/sorts/mt_lsb_sort.rs index 7f117b0..666114e 100644 --- a/src/sorts/mt_lsb_sort.rs +++ b/src/sorts/mt_lsb_sort.rs @@ -211,6 +211,10 @@ impl<'a> Sorter<'a> { }); }); + if level == 0 { + return; + } + self.director(bucket, counts, level - 1); } } From 2c15b87abab7b9b9d99d118e6b5cf5b939e07739 Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Sat, 8 Feb 2025 23:19:43 +0900 Subject: [PATCH 22/24] Use opt-level 2 for tests --- Cargo.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index d95a1e6..563f28b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,6 +30,10 @@ criterion = { version = "0.5.1", default-features=false, features = ["rayon", "c codegen-units = 1 opt-level = 3 +[profile.test] +opt-level = 2 +debug = true + [[bench]] name = "basic_sort" harness = false From 48ddb3e3c208b57395ff8a5518c266a0df2c3ded Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Sat, 8 Feb 2025 23:19:43 +0900 Subject: [PATCH 23/24] Run cargo fmt --- src/sorts/comparative_sort.rs | 9 +++++++-- src/sorts/lsb_sort.rs | 13 ++++++++++--- src/sorts/mt_lsb_sort.rs | 17 ++++++++++++----- src/sorts/recombinating_sort.rs | 9 +++++++-- src/sorts/regions_sort.rs | 9 +++++++-- src/sorts/scanning_sort.rs | 9 +++++++-- src/sorts/ska_sort.rs | 9 +++++++-- 7 files changed, 57 insertions(+), 18 deletions(-) diff --git a/src/sorts/comparative_sort.rs b/src/sorts/comparative_sort.rs index b58662e..f9a92e6 100644 --- a/src/sorts/comparative_sort.rs +++ b/src/sorts/comparative_sort.rs @@ -53,7 +53,10 @@ impl<'a> Sorter<'a> { #[cfg(test)] mod tests { use crate::sorter::Sorter; - use crate::test_utils::{sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, SingleAlgoTuner}; + use crate::test_utils::{ + sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, + SingleAlgoTuner, + }; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; use crate::RadixKey; @@ -62,7 +65,9 @@ mod tests { where T: NumericTest, { - let tuner = SingleAlgoTuner{ algo: Algorithm::Comparative }; + let tuner = SingleAlgoTuner { + algo: Algorithm::Comparative, + }; sort_comparison_suite(shift, |inputs| { let sorter = Sorter::new(true, &tuner); sorter.comparative_sort(inputs, T::LEVELS - 1); diff --git a/src/sorts/lsb_sort.rs b/src/sorts/lsb_sort.rs index 6ceb552..3cf3d46 100644 --- a/src/sorts/lsb_sort.rs +++ b/src/sorts/lsb_sort.rs @@ -193,7 +193,10 @@ impl<'a> Sorter<'a> { #[cfg(test)] mod tests { use crate::sorter::Sorter; - use crate::test_utils::{sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, SingleAlgoTuner}; + use crate::test_utils::{ + sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, + SingleAlgoTuner, + }; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; use crate::RadixKey; @@ -202,8 +205,12 @@ mod tests { where T: NumericTest, { - let tuner = SingleAlgoTuner{ algo: Algorithm::Lsb }; - let tuner_lsb = SingleAlgoTuner{ algo: Algorithm::LrLsb }; + let tuner = SingleAlgoTuner { + algo: Algorithm::Lsb, + }; + let tuner_lsb = SingleAlgoTuner { + algo: Algorithm::LrLsb, + }; sort_comparison_suite(shift, |inputs| { let sorter = Sorter::new(true, &tuner); diff --git a/src/sorts/mt_lsb_sort.rs b/src/sorts/mt_lsb_sort.rs index 666114e..97e119a 100644 --- a/src/sorts/mt_lsb_sort.rs +++ b/src/sorts/mt_lsb_sort.rs @@ -221,21 +221,28 @@ impl<'a> Sorter<'a> { #[cfg(test)] mod tests { + use crate::counts::CountManager; use crate::sorter::Sorter; - use crate::test_utils::{sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, SingleAlgoTuner}; + use crate::test_utils::{ + sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, + SingleAlgoTuner, + }; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; + use crate::utils::{aggregate_tile_counts, get_tile_counts}; use crate::RadixKey; use rayon::current_num_threads; - use crate::counts::CountManager; - use crate::utils::{aggregate_tile_counts, get_tile_counts}; fn test_mt_lsb_sort_adapter(shift: T) where T: NumericTest, { - let tuner = SingleAlgoTuner{ algo: Algorithm::MtLsb }; - let tuner_oop = SingleAlgoTuner{ algo: Algorithm::MtOop }; + let tuner = SingleAlgoTuner { + algo: Algorithm::MtLsb, + }; + let tuner_oop = SingleAlgoTuner { + algo: Algorithm::MtOop, + }; sort_comparison_suite(shift, |inputs| { if inputs.len() == 0 { diff --git a/src/sorts/recombinating_sort.rs b/src/sorts/recombinating_sort.rs index 3d037f2..4a82256 100644 --- a/src/sorts/recombinating_sort.rs +++ b/src/sorts/recombinating_sort.rs @@ -116,7 +116,10 @@ impl<'a> Sorter<'a> { mod tests { use crate::counts::CountManager; use crate::sorter::Sorter; - use crate::test_utils::{sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, SingleAlgoTuner}; + use crate::test_utils::{ + sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, + SingleAlgoTuner, + }; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; use crate::utils::{aggregate_tile_counts, get_tile_counts}; @@ -127,7 +130,9 @@ mod tests { where T: NumericTest, { - let tuner = SingleAlgoTuner{ algo: Algorithm::Recombinating }; + let tuner = SingleAlgoTuner { + algo: Algorithm::Recombinating, + }; sort_comparison_suite(shift, |inputs| { let level = T::LEVELS - 1; diff --git a/src/sorts/regions_sort.rs b/src/sorts/regions_sort.rs index 65ac5b4..18c2996 100644 --- a/src/sorts/regions_sort.rs +++ b/src/sorts/regions_sort.rs @@ -311,7 +311,10 @@ impl<'a> Sorter<'a> { mod tests { use crate::counts::CountManager; use crate::sorter::Sorter; - use crate::test_utils::{sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, SingleAlgoTuner}; + use crate::test_utils::{ + sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, + SingleAlgoTuner, + }; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; use crate::utils::{aggregate_tile_counts, get_tile_counts}; @@ -322,7 +325,9 @@ mod tests { where T: NumericTest, { - let tuner = SingleAlgoTuner{ algo: Algorithm::Regions }; + let tuner = SingleAlgoTuner { + algo: Algorithm::Regions, + }; sort_comparison_suite(shift, |inputs| { let cm = CountManager::default(); diff --git a/src/sorts/scanning_sort.rs b/src/sorts/scanning_sort.rs index 7ec2746..611835f 100644 --- a/src/sorts/scanning_sort.rs +++ b/src/sorts/scanning_sort.rs @@ -275,7 +275,10 @@ impl<'a> Sorter<'a> { mod tests { use crate::counts::CountManager; use crate::sorter::Sorter; - use crate::test_utils::{sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, SingleAlgoTuner}; + use crate::test_utils::{ + sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, + SingleAlgoTuner, + }; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; use crate::RadixKey; @@ -284,7 +287,9 @@ mod tests { where T: NumericTest, { - let tuner = SingleAlgoTuner{ algo: Algorithm::Scanning }; + let tuner = SingleAlgoTuner { + algo: Algorithm::Scanning, + }; sort_comparison_suite(shift, |inputs| { let cm = CountManager::default(); diff --git a/src/sorts/ska_sort.rs b/src/sorts/ska_sort.rs index 3d75c53..c104003 100644 --- a/src/sorts/ska_sort.rs +++ b/src/sorts/ska_sort.rs @@ -127,7 +127,10 @@ impl<'a> Sorter<'a> { #[cfg(test)] mod tests { use crate::sorter::Sorter; - use crate::test_utils::{sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, SingleAlgoTuner}; + use crate::test_utils::{ + sort_comparison_suite, sort_single_algorithm, validate_u32_patterns, NumericTest, + SingleAlgoTuner, + }; use crate::tuner::Algorithm; use crate::tuners::StandardTuner; use crate::RadixKey; @@ -136,7 +139,9 @@ mod tests { where T: NumericTest, { - let tuner = SingleAlgoTuner{ algo: Algorithm::Ska }; + let tuner = SingleAlgoTuner { + algo: Algorithm::Ska, + }; sort_comparison_suite(shift, |inputs| { let sorter = Sorter::new(true, &tuner); From 5cfdc6193664ce5c3c0fcc9543d35d8e35b7e8e2 Mon Sep 17 00:00:00 2001 From: Nathan Essex Date: Sat, 8 Feb 2025 23:19:43 +0900 Subject: [PATCH 24/24] Account for threads being interrupted while holding shared tmp buffers --- src/counts.rs | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/counts.rs b/src/counts.rs index 97e442d..50fd4f2 100644 --- a/src/counts.rs +++ b/src/counts.rs @@ -150,11 +150,22 @@ impl CountManager { { Self::THREAD_CTX.with(|ct| { let byte_len = size_of_val(src_bucket); - let mut t = ct.tmp.borrow_mut(); - - if t.len() < byte_len { - *t = Vec::with_capacity(byte_len); - } + let thread_tmp = ct.tmp.try_borrow_mut(); + let one_off_tmp: RefCell>; + + let mut t = match thread_tmp { + Ok(mut t) => { + if t.len() < byte_len { + *t = Vec::with_capacity(byte_len); + } + + t + } + Err(_) => { + one_off_tmp = RefCell::new(Vec::with_capacity(byte_len)); + one_off_tmp.borrow_mut() + } + }; // Safety: The buffer is guaranteed to have enough capacity by the logic above. // As the data is copied from the source buffer to the temporary buffer, and @@ -350,4 +361,4 @@ mod tests { assert_eq!(counts_1.0.take(), expected_1); assert_eq!(counts_2.0.take(), expected_2); } -} \ No newline at end of file +}