From 1f0515c6c32bd9dfba0ca1db7d56d66b00219327 Mon Sep 17 00:00:00 2001 From: Spencer Baugh Date: Wed, 22 Sep 2021 21:10:53 -0400 Subject: [PATCH] initial exported version of iqueue --- .envrc | 1 + .gitignore | 41 + LICENSE | 13 + Makefile.am | 90 ++ README | 14 + configure.ac | 17 + default.nix | 7 + include/iqsync.h | 92 +- include/iqueue.h | 118 +- include/iqueue.hh | 471 ++++++++ include/shash.h | 15 + include/stringer.hh | 31 + iqueue.pc.in | 10 + src/container_of.h | 37 + src/dump_iqueue.cc | 64 + src/getlogstr.c | 208 ++++ src/in2iqueue.cc | 88 ++ src/io_utils.c | 107 ++ src/io_utils.h | 61 + src/iqmod_common.h | 67 ++ src/iqmod_copy-main.c | 470 ++++++++ src/iqmod_inplace-main.c | 148 +++ src/iqsync-main.c | 711 +++++++---- src/iqsync.c | 1776 +++++++++++++++++----------- src/iqueue-main.c | 724 ++++++------ src/iqueue.c | 1494 ++++++++++++----------- src/iqueue2out.cc | 120 ++ src/iqueue_tail_count.cc | 62 + src/math_utils.c | 37 + src/math_utils.h | 19 + src/net_utils.c | 146 +++ src/net_utils.h | 38 + src/proc_utils.c | 105 ++ src/proc_utils.h | 39 + src/shash.c | 23 +- src/try_unix.hh | 50 + src/tsassert.c | 65 + src/tsassert.h | 62 + src/tsclock.h | 36 + src/tsdir.c | 127 ++ src/tsdir.h | 50 + src/tsflexhash.c | 299 +++++ src/tsflexhash.h | 823 +++++++++++++ src/tsflexhash_private.h | 50 + src/tsgosmacs.h | 45 + src/tslock.h | 387 ++++++ src/tslog.c | 592 ++++++++++ src/tslog.h | 287 +++++ src/tstl.c | 166 +++ src/tstl.h | 144 +++ src/twosigma.h | 185 +++ src/wait_for_heartbeat.cc | 86 ++ test/copyout.test | 143 +++ test/ctest.h | 156 +++ test/ctest_main.h | 305 +++++ test/ctest_resource.c | 161 +++ test/ctest_resource.h | 43 + test/grow_ctest.c | 93 ++ test/heartbeat-writeback-test.c | 78 ++ test/iqmod_ctest.c | 87 ++ test/iqsync-bidirectional.test | 160 +++ test/iqsync-buffer.test | 204 ++++ test/iqsync-cascade.test | 179 +++ test/iqsync-filter.test | 218 ++++ test/iqsync-latency-test.c | 124 ++ test/iqsync-multi.test | 143 +++ 
test/iqsync-pingpong-test.c | 185 +++ test/iqsync-pushpull.test | 172 +++ test/iqsync-scan.test | 224 ++++ test/iqsync-tcp.test | 234 ++++ test/iqsync-verify.test | 224 ++++ test/iqueue-big-test.c | 420 +++++++ test/iqueue-contention-test.c | 194 +++ test/iqueue-dc-test.c | 154 +++ test/iqueue-latency-test.c | 206 ++++ test/iqueue-overhead-test.c | 165 +++ test/iqueue-read-contention-test.c | 230 ++++ test/iqueue-seal-test | 112 ++ test/iqueue-seal-test.test | 110 ++ test/iqueue_allocator_ctest.c | 182 +++ test/iqueue_reopen_ctest.c | 114 ++ test/iqueue_symlink_ctest.c | 87 ++ test/iqueue_try_update_ctest.c | 104 ++ test/iqueue_writer_ctest.c | 63 + test/nop-ssh | 18 + test/shash/shash_ctest.c | 184 +++ test/unlink_ctest.c | 71 ++ 87 files changed, 14436 insertions(+), 2029 deletions(-) create mode 100644 .envrc create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 Makefile.am create mode 100644 README create mode 100644 configure.ac create mode 100644 default.nix create mode 100644 include/iqueue.hh create mode 100644 include/stringer.hh create mode 100644 iqueue.pc.in create mode 100644 src/container_of.h create mode 100644 src/dump_iqueue.cc create mode 100644 src/getlogstr.c create mode 100644 src/in2iqueue.cc create mode 100644 src/io_utils.c create mode 100644 src/io_utils.h create mode 100644 src/iqmod_common.h create mode 100644 src/iqmod_copy-main.c create mode 100644 src/iqmod_inplace-main.c create mode 100644 src/iqueue2out.cc create mode 100644 src/iqueue_tail_count.cc create mode 100644 src/math_utils.c create mode 100644 src/math_utils.h create mode 100644 src/net_utils.c create mode 100644 src/net_utils.h create mode 100644 src/proc_utils.c create mode 100644 src/proc_utils.h create mode 100644 src/try_unix.hh create mode 100644 src/tsassert.c create mode 100644 src/tsassert.h create mode 100644 src/tsclock.h create mode 100644 src/tsdir.c create mode 100644 src/tsdir.h create mode 100644 src/tsflexhash.c create mode 100644 
src/tsflexhash.h create mode 100644 src/tsflexhash_private.h create mode 100644 src/tsgosmacs.h create mode 100644 src/tslock.h create mode 100644 src/tslog.c create mode 100644 src/tslog.h create mode 100644 src/tstl.c create mode 100644 src/tstl.h create mode 100644 src/twosigma.h create mode 100644 src/wait_for_heartbeat.cc create mode 100755 test/copyout.test create mode 100644 test/ctest.h create mode 100644 test/ctest_main.h create mode 100644 test/ctest_resource.c create mode 100644 test/ctest_resource.h create mode 100644 test/grow_ctest.c create mode 100644 test/heartbeat-writeback-test.c create mode 100644 test/iqmod_ctest.c create mode 100755 test/iqsync-bidirectional.test create mode 100755 test/iqsync-buffer.test create mode 100755 test/iqsync-cascade.test create mode 100755 test/iqsync-filter.test create mode 100644 test/iqsync-latency-test.c create mode 100755 test/iqsync-multi.test create mode 100644 test/iqsync-pingpong-test.c create mode 100755 test/iqsync-pushpull.test create mode 100755 test/iqsync-scan.test create mode 100755 test/iqsync-tcp.test create mode 100755 test/iqsync-verify.test create mode 100644 test/iqueue-big-test.c create mode 100644 test/iqueue-contention-test.c create mode 100644 test/iqueue-dc-test.c create mode 100644 test/iqueue-latency-test.c create mode 100644 test/iqueue-overhead-test.c create mode 100644 test/iqueue-read-contention-test.c create mode 100644 test/iqueue-seal-test create mode 100755 test/iqueue-seal-test.test create mode 100644 test/iqueue_allocator_ctest.c create mode 100644 test/iqueue_reopen_ctest.c create mode 100644 test/iqueue_symlink_ctest.c create mode 100644 test/iqueue_try_update_ctest.c create mode 100644 test/iqueue_writer_ctest.c create mode 100755 test/nop-ssh create mode 100644 test/shash/shash_ctest.c create mode 100644 test/unlink_ctest.c diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..4a4726a --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use_nix diff --git a/.gitignore 
b/.gitignore new file mode 100644 index 0000000..115ec8b --- /dev/null +++ b/.gitignore @@ -0,0 +1,41 @@ +iqueue +*.Po +*.o +*.lo +*.la +config.h.in +Makefile.in +Makefile +src/iqsync +src/iqueue +src/libiqueue.a +/aclocal.m4 +/ar-lib +/autom4te.cache/ +/compile +/config.h +/config.status +/configure +/depcomp +/install-sh +/missing +/stamp-h1 +/build-aux +/m4 +/.libs +/libtool +iqsync +/iqueue.pc +.deps +.dirstamp +/iqmod_copy +/iqmod_inplace +*.log +*.a +*.trs +/*ctest +/dump_iqueue +/in2iqueue +/iqueue2out +/wait_for_heartbeat +/iqueue_tail_count diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7aaa10a --- /dev/null +++ b/LICENSE @@ -0,0 +1,13 @@ +Copyright 2021 Two Sigma Open Source, LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 0000000..249617f --- /dev/null +++ b/Makefile.am @@ -0,0 +1,90 @@ +## Process this file with automake to produce Makefile.in +AM_CFLAGS = -g -Og -Wall -Wextra -Werror -pthread +AM_CXXFLAGS = $(AM_CFLAGS) -std=gnu++17 +AM_CPPFLAGS = -I$(srcdir)/include -I$(srcdir)/src + +# Library +pkgconfig_DATA = iqueue.pc +lib_LTLIBRARIES = libiqueue.la + +libiqueue_la_SOURCES = src/iqueue.c src/iqsync.c \ + src/math_utils.h src/math_utils.c \ + src/net_utils.h src/net_utils.c \ + src/proc_utils.h src/proc_utils.c \ + src/io_utils.h src/io_utils.c \ + src/shash.c \ + src/tsgosmacs.h src/tsflexhash.h src/tsflexhash_private.h src/tsflexhash.c \ + src/tslog.h src/tslog.c \ + src/tsassert.h src/tsassert.c \ + src/twosigma.h \ + src/tsclock.h \ + src/tstl.h src/tsdir.h \ + src/try_unix.hh \ + src/tstl.c src/tsdir.c src/getlogstr.c \ + src/container_of.h \ + src/tslock.h src/iqmod_common.h +include_HEADERS = include/iqueue.h include/iqsync.h include/shash.h include/iqueue.hh include/stringer.hh + +AM_LDFLAGS = -lbsd +# Programs +bin_PROGRAMS = iqueue iqsync wait_for_heartbeat iqueue_tail_count iqmod_copy iqmod_inplace in2iqueue iqueue2out dump_iqueue + +iqueue_SOURCES = src/iqueue-main.c +iqueue_LDADD = libiqueue.la + +iqsync_SOURCES = src/iqsync-main.c +iqsync_LDADD = libiqueue.la -ldl + +wait_for_heartbeat_SOURCES = src/wait_for_heartbeat.cc +wait_for_heartbeat_LDADD = libiqueue.la + +iqueue_tail_count_SOURCES = src/iqueue_tail_count.cc +iqueue_tail_count_LDADD = libiqueue.la + +iqmod_copy_SOURCES = src/iqmod_copy-main.c +iqmod_copy_LDADD = libiqueue.la + +iqmod_inplace_SOURCES = src/iqmod_inplace-main.c +iqmod_inplace_LDADD = libiqueue.la + +in2iqueue_SOURCES = src/in2iqueue.cc +in2iqueue_LDADD = libiqueue.la + +iqueue2out_SOURCES = src/iqueue2out.cc +iqueue2out_LDADD = libiqueue.la + +dump_iqueue_SOURCES = src/dump_iqueue.cc +dump_iqueue_LDADD = libiqueue.la + +# Tests +check_LIBRARIES = libtstest.a 
+libtstest_a_SOURCES = test/ctest.h test/ctest_main.h test/ctest_resource.c test/ctest_resource.h + +check_PROGRAMS = iqueue_reopen_ctest grow_ctest iqueue_try_update_ctest iqueue_writer_ctest \ + iqueue_symlink_ctest iqueue_allocator_ctest unlink_ctest iqmod_ctest +iqueue_reopen_ctest_SOURCES = test/iqueue_reopen_ctest.c +iqueue_reopen_ctest_LDADD = libiqueue.la libtstest.a + +grow_ctest_SOURCES = test/grow_ctest.c +grow_ctest_LDADD = libiqueue.la libtstest.a + +iqueue_try_update_ctest_SOURCES = test/iqueue_try_update_ctest.c +iqueue_try_update_ctest_LDADD = libiqueue.la libtstest.a + +iqueue_writer_ctest_SOURCES = test/iqueue_writer_ctest.c +iqueue_writer_ctest_LDADD = libiqueue.la libtstest.a + +iqueue_symlink_ctest_SOURCES = test/iqueue_symlink_ctest.c +iqueue_symlink_ctest_LDADD = libiqueue.la libtstest.a + +# Make sure TMPDIR is set to a place with lots of space for this one +iqueue_allocator_ctest_SOURCES = test/iqueue_allocator_ctest.c +iqueue_allocator_ctest_LDADD = libiqueue.la libtstest.a + +unlink_ctest_SOURCES = test/unlink_ctest.c +unlink_ctest_LDADD = libiqueue.la libtstest.a + +iqmod_ctest_SOURCES = test/iqmod_ctest.c +iqmod_ctest_LDADD = libiqueue.la libtstest.a + +TESTS = $(check_PROGRAMS) diff --git a/README b/README new file mode 100644 index 0000000..e5e8f01 --- /dev/null +++ b/README @@ -0,0 +1,14 @@ +Indexed queue. + +A persistent, connectionless, message-based transport. + +- Stored in a single file +- No additional processes +- No setup required beyond opening the file and beginning to read or write +- Multi-reader, each reader gets every message +- Multi-writer, writes are atomic and persistent + + +NOTE: + +iqueue fails with a SIGBUS when it runs out of disk-space. 
diff --git a/configure.ac b/configure.ac new file mode 100644 index 0000000..2c581fd --- /dev/null +++ b/configure.ac @@ -0,0 +1,17 @@ +AC_INIT([iqueue], [0.1.0], [sbaugh@twosigma.com]) +AC_CONFIG_AUX_DIR([build-aux]) +AC_CONFIG_MACRO_DIRS([m4]) +AM_INIT_AUTOMAKE([-Wall -Werror foreign subdir-objects]) +AC_PROG_CC +AC_PROG_CXX +AM_PROG_AR +LT_INIT +dnl workaround for https://github.com/kimwalisch/primesieve/issues/16 +AC_SUBST(AR_FLAGS, [cr]) +PKG_INSTALLDIR +AC_CONFIG_HEADERS([config.h]) +AC_CONFIG_FILES([ + Makefile + iqueue.pc +]) +AC_OUTPUT diff --git a/default.nix b/default.nix new file mode 100644 index 0000000..97e0f9b --- /dev/null +++ b/default.nix @@ -0,0 +1,7 @@ +with import <nixpkgs> {}; + +stdenv.mkDerivation rec { + name = "iqueue"; + src = ./.; + buildInputs = [ microsoft_gsl autoconf automake libtool pkgconfig libbsd autoreconfHook ]; +} diff --git a/include/iqsync.h b/include/iqsync.h index bea5c71..2ee8052 100644 --- a/include/iqsync.h +++ b/include/iqsync.h @@ -1,18 +1,29 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifndef _dma_transport_iqsync_h_ #define _dma_transport_iqsync_h_ /** \file * iqsync magic constants for both the ssh and udp versions.
*/ -#include "twosigma.h" #include #include #include #include "iqueue.h" -#include "tslock.h" - - -#define IQSYNC_HANDSHAKE_MAGIC 0x495148414E440005 +#define IQSYNC_HANDSHAKE_MAGIC 0x495148414E440005 /** Send at the start of TCP connection or every few seconds by * the multicast version. @@ -86,17 +97,46 @@ typedef struct } iqsync_shadow_t; +/** + * iqsync filtering function, which will be called on every outbound + * local iqueue message before it is sent to the remote side (push) + * Valid return codes are: + * 1: Forward the message + * 0: Skip the message + * -1: Stop processing further messages and exit + */ +typedef int (*iqsync_filter_fn_t)(void *handle, const void *buf, size_t len); +typedef int (*iqsync_filter_setup_fn_t)( + const void *buf, + size_t len, + void **filter_fn_priv, + iqsync_filter_fn_t *filter_fn); + +typedef struct +{ + iqsync_filter_setup_fn_t filter_setup; + iqsync_filter_fn_t filter_fn; + + void *filter_fn_priv; +} iqsync_filter_t; + /** Book keeping and options for the iqsync process. * This is not the easiest structure to use for outside processes; * it will likely have some significant rework if any other * applications want to use the iqsync algorithm.
*/ +#define DEFAULT_RECVBUFFER_SIZE (1 << 20) // 1MB +#define DEFAULT_SENDBUFFER_SIZE (1 << 12) // 4KB + +struct extremely_dangerous_internal_tslock_s; +typedef struct extremely_dangerous_internal_tslock_s tslock_t; typedef struct { int read_fd; int write_fd; int do_clone; + int do_clone_push; int do_tail; int do_push; int do_pull; @@ -104,12 +144,17 @@ typedef struct int do_server; int do_prefetch; int do_syncbehind; + int use_sendbuffer; + int use_recvbuffer; + volatile int do_shutdown; int usleep_time; int verbose; + int quiet; uint64_t report_interval; uint64_t rate_limit; // in MB/s uint64_t avg_msg_len; // in bytes + uint64_t connection_timeout_sec; iqueue_t * iq; bool close_iq_on_shutdown; @@ -139,11 +184,48 @@ typedef struct pthread_t prefetch_thread; pthread_t syncbehind_thread; + pthread_mutex_t stats_shutdown_mutex; + pthread_cond_t stats_shutdown_cond; + iqsync_shadow_t remote; iqsync_shadow_t local; + + unsigned filter_count; + iqsync_filter_t *filters; + + int64_t initialization_rc; + int wait_complete; + + uint32_t recvbuffer_len; + uint8_t recvbuffer_block_shift; + uint64_t recvbuffer_offset_mask; + uint32_t sendbuffer_len; + uint8_t sendbuffer_block_shift; + uint64_t sendbuffer_offset_mask; + + /** + * internal buffer used to store packets + */ + volatile uint64_t recvbuffer_read_idx; + volatile uint64_t recvbuffer_write_idx; + char *recvbuffer; + + uint32_t sendbuffer_data_len; + char *sendbuffer; + } iqsync_t; +int +iqsync_start_async( + iqsync_t * iqsync +); + +int +iqsync_start_async_wait( + iqsync_t * iqsync +); + int iqsync_start( iqsync_t * iqsync diff --git a/include/iqueue.h b/include/iqueue.h index 8e8515f..e0ce918 100644 --- a/include/iqueue.h +++ b/include/iqueue.h @@ -1,24 +1,28 @@ -/* $TwoSigma: iqueue.h,v 1.21 2012/02/07 13:37:46 thudson Exp $ */ - /* - * Copyright (c) 2010 Two Sigma Investments, LLC - * All Rights Reserved + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the 
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF - * Two Sigma Investments, LLC. + * http://www.apache.org/licenses/LICENSE-2.0 * - * The copyright notice above does not evidence any - * actual or intended publication of such source code. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #ifndef _iqueue_h_ #define _iqueue_h_ -#include "twosigma.h" +#include #include #include +#include #include -#include "bswap.h" +#include #include "shash.h" __BEGIN_DECLS @@ -49,9 +53,9 @@ typedef struct { } iqueue_msg_t; #define IQUEUE_MSG_SEALED ((uint64_t)-1) -#define IQUEUE_MSG_BITS 44 -#define IQUEUE_MSG_MASK ((((uint64_t) 1) << IQUEUE_MSG_BITS) - 1) -#define IQUEUE_MSG_MAX ((((uint64_t) 1) << (64 - IQUEUE_MSG_BITS)) - 1) +#define IQUEUE_MSG_BITS 44 +#define IQUEUE_MSG_MASK ((((uint64_t) 1) << IQUEUE_MSG_BITS) - 1) +#define IQUEUE_MSG_MAX ((((uint64_t) 1) << (64 - IQUEUE_MSG_BITS)) - 1) #define IQUEUE_MSG_BAD_ID ((uint64_t) -1) @@ -61,7 +65,7 @@ iqueue_msg( uint64_t len ) { return (iqueue_msg_t) { - (len << IQUEUE_MSG_BITS) | (offset & IQUEUE_MSG_MASK) + (len << IQUEUE_MSG_BITS) | (offset & IQUEUE_MSG_MASK) }; } @@ -96,6 +100,19 @@ iqueue_open( ); +/** Update an iqueue's creation_time + * + * @param creation_time: the new creation_time + * + * @return -1 if there are any errors. + */ +int +iqueue_update_creation_time( + iqueue_t * iq, + uint64_t creation_time +); + + /** Create or open an iqueue_t object backed by a file. 
* * Atomically creates an iqueue file on disk, or returns the existing @@ -233,29 +250,6 @@ iqueue_name( const iqueue_t * iqueue ); - -typedef enum { - IQUEUE_MADV_WILLNEED, - IQUEUE_MADV_DONTNEED, -} iqueue_madvise_t; - -/** Advise the kernel about regions of the iqueue. - * @param start and end define the indices that are of interest. - * - * \note Since the iqueue data segment might not - * be contiguous and in order, it is possible for the portion of - * the file defined by start to be earlier than the region - * defined by end. No madvise request will be made in that case. - */ -int -iqueue_madavise( - iqueue_t * iqueue, - iqueue_madvise_t advice, - iqueue_id_t start, - iqueue_id_t end -); - - /** Retrieve the shared hash of writers. * * The iqueue maintains a list of entries of "writers" that can @@ -304,9 +298,9 @@ iqueue_writer_update( -#define IQUEUE_BLOCK_SHIFT 30 -#define IQUEUE_BLOCK_SIZE ((uint64_t) (1 << IQUEUE_BLOCK_SHIFT)) -#define IQUEUE_BLOCK_MASK (IQUEUE_BLOCK_SIZE - 1) +#define IQUEUE_BLOCK_SHIFT 30 +#define IQUEUE_BLOCK_SIZE ((uint64_t) (1 << IQUEUE_BLOCK_SHIFT)) +#define IQUEUE_BLOCK_MASK (IQUEUE_BLOCK_SIZE - 1) /** Returns the id of the first entry in the iqueue */ @@ -392,8 +386,8 @@ iqueue_data( ) { uint64_t offset = iqueue_offset(iq, id, size_out); - if (unlikely(offset == (uint64_t) -1)) - return NULL; + if (offset == (uint64_t) -1) + return NULL; return iqueue_get_data(iq, offset, 1); } @@ -470,27 +464,27 @@ iqueue_allocate( iqueue_msg_t * msg_out ) { - if (unlikely(len > allocator->bulk_len)) - return NULL; + if (len > allocator->bulk_len) + return NULL; while (1) { - uint64_t base = allocator->base_offset + allocator->offset; - uint64_t aligned = (base + allocator->align_mask) - & ~allocator->align_mask; - uint64_t offset = aligned + len - allocator->base_offset; - - if (likely(offset <= allocator->bulk_len)) - { - allocator->offset = offset; - *msg_out = iqueue_msg(aligned, len); - return allocator->base + aligned - 
allocator->base_offset; - } - - // It didn't fit; try to get more space - if (!allocator->auto_refill - || iqueue_allocator_refill(allocator) < 0) - return NULL; + uint64_t base = allocator->base_offset + allocator->offset; + uint64_t aligned = (base + allocator->align_mask) + & ~allocator->align_mask; + uint64_t offset = aligned + len - allocator->base_offset; + + if (offset <= allocator->bulk_len) + { + allocator->offset = offset; + *msg_out = iqueue_msg(aligned, len); + return allocator->base + aligned - allocator->base_offset; + } + + // It didn't fit; try to get more space + if (!allocator->auto_refill + || iqueue_allocator_refill(allocator) < 0) + return NULL; } } @@ -567,12 +561,12 @@ iqueue_append( ) { if (len >= IQUEUE_MSG_MAX) - return IQUEUE_STATUS_INVALID_ARGUMENT; + return IQUEUE_STATUS_INVALID_ARGUMENT; iqueue_msg_t iqmsg; void * const msg = iqueue_allocate_raw(iq, len, &iqmsg); if (!msg) - return IQUEUE_STATUS_NO_SPACE; + return IQUEUE_STATUS_NO_SPACE; memcpy(msg, buf, len); int rc = iqueue_update(iq, iqmsg, NULL); diff --git a/include/iqueue.hh b/include/iqueue.hh new file mode 100644 index 0000000..f0d8310 --- /dev/null +++ b/include/iqueue.hh @@ -0,0 +1,471 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "stringer.hh" +#include "shash.h" +#include "iqueue.h" +#include +#include +#include +#include +#include + +namespace ts { namespace mmia { namespace cpputils { + +enum class access_mode { read_only, write }; + +// Provides an iterator for read-only iqueue iteration. +struct iqueue_const_iterator +{ + using self_type = iqueue_const_iterator; + using value_type = gsl::span; + using difference_type = std::int64_t; + using pointer = value_type const *; + using reference = value_type const&; + using iterator_category = std::random_access_iterator_tag; + + iqueue_const_iterator() = default; + + iqueue_const_iterator(self_type const& rhs) = default; + + self_type& operator=(self_type const& rhs) = default; + + friend bool operator == (self_type const& lhs, self_type const& rhs) { + return lhs._id == rhs._id && lhs._iqueue == rhs._iqueue; + } + friend bool operator != (self_type const& lhs, self_type const& rhs) { + return !(lhs == rhs); + } + + self_type& operator++() { + ++_id; + return *this; + } + self_type operator++(int) { return self_type(_iqueue, _id++); } + + friend self_type operator+(self_type const &lhs, difference_type n) { + return self_type(lhs._iqueue, lhs._id+n); + } + friend difference_type operator-(self_type const &lhs, self_type const& rhs) { + return std::int64_t(lhs._id) - std::int64_t(rhs._id); + } + + reference operator*() const { + std::size_t size = 0; + void const* ptr = iqueue_data(_iqueue, _id, &size); + _current = {static_cast(ptr), static_cast(size)}; + return _current; + } + + pointer operator->() const { + return &(this->operator*()); + } + + iqueue_id_t id() const {return _id;} + +private: + + friend struct iqueue; + iqueue_const_iterator(iqueue_t* iq, iqueue_id_t id) + : _iqueue(iq) + , _id(id) { + } + + iqueue_t* _iqueue = nullptr; + iqueue_id_t _id = 0; + mutable value_type _current{}; +}; + +// Provides a type-safe iqueue shash entry with convenient 'update' method. 
+template +struct shash_entry +{ + using self_type = shash_entry; + using key_type = uint64_t; + using value_type = ValueType; + + shash_entry(shash_t* table, shash_entry_t* entry) : _table(table), _entry(entry) {} + shash_entry() = delete; + shash_entry(self_type const &rhs) = default; + self_type &operator = (self_type const &rhs) = default; + + void update(value_type value) { iqueue_writer_update(_table, _entry, static_cast(value)); } + key_type key() const { return _entry->key; } + value_type value() const { return value_type{_entry->value}; } + friend bool operator == (self_type const &lhs, self_type const &rhs) { + return lhs._table == rhs._table && lhs._entry == rhs._entry; + } + friend bool operator != (self_type const &lhs, self_type const &rhs) { + return lhs._table != rhs._table || lhs._entry != rhs._entry; + } + shash_t *table() const {return _table;} + explicit operator shash_entry_t *() const {return _entry;} + self_type& operator ++() {++_entry; return *this;} + self_type operator++(int) {return self_type(_table, _entry++);} + +private: + shash_t* _table = nullptr; + shash_entry_t* _entry = nullptr; +}; + +// Provides an iterator over type-safe shash entries in an shash table. 
+template +struct shash_entry_iterator +{ + using self_type = shash_entry_iterator; + using value_type = std::remove_cv_t; + using difference_type = std::ptrdiff_t; + using pointer = ValueType*; + using reference = ValueType&; + using iterator_category = std::random_access_iterator_tag; + + shash_entry_iterator() = default; + shash_entry_iterator(self_type const& rhs) = default; + self_type& operator=(self_type const& rhs) = default; + + friend bool operator == (shash_entry_iterator const &lhs, + shash_entry_iterator const& rhs) { + return lhs._current == rhs._current; + } + friend bool operator != (shash_entry_iterator const &lhs, + shash_entry_iterator const& rhs) { + return lhs._current != rhs._current; + } + + self_type& operator++() { ++_current; return *this; } + self_type operator++(int) { return self_type(value_type{_current.table(), static_cast(_current++)}); } + + friend self_type operator+(self_type const &lhs, difference_type n) { + return self_type(lhs._current.table(), static_cast(lhs._current)+n); + } + + friend difference_type operator-(self_type const& lhs, self_type const& rhs) { + if (lhs._current.table() != rhs._current.table()) { + throw std::runtime_error("incompatible iterators"); + } + return static_cast(lhs._current) - static_cast(rhs._current); + } + + reference operator*() const { return _current; } + pointer operator->() const { return &_current; } + +private: + shash_entry_iterator(shash_t *table, shash_entry_t* entry) : _current{table, entry} {} + explicit shash_entry_iterator(value_type const &entry) : _current{entry} {} + + mutable value_type _current; + template + friend struct shash_table; +}; + +// Provide an associative container of type-safe shash_entries. 
+template +struct shash_table +{ + using mapped_type = shash_entry; + using const_iterator = shash_entry_iterator; + using iterator = shash_entry_iterator; + using mapped_value_type = typename mapped_type::value_type; + using key_type = typename mapped_type::key_type; + using value_type = std::pair; + using size_type = unsigned; + + iterator begin() { return begin_internal(); } + iterator end() { return end_internal();} + const_iterator cbegin() const { return begin_internal(); } + const_iterator cend() const { return end_internal(); } + const_iterator begin() const { return cbegin(); } + const_iterator end() const { return cend();} + size_type size() const { + return static_cast(cend()-cbegin()); + } + bool empty() const {return size() == 0;} + + // Creates a new heartbeat entry with the specified 'key', using an initial value of zero. + mapped_type create(key_type key) { + return mapped_type{_table, shash_insert(_table, key, 0)}; + } + // Creates a new heartbeat entry with the specified 'key' and the specified 'value'. + mapped_type create(key_type key, mapped_value_type value) { + if (key == 0) { + throw std::runtime_error("You are not permitted to use key 0! Zero is a special value."); + } + return mapped_type{_table, shash_insert(_table, key, static_cast(value))}; + } + // Insert the specified 'value', returning a pair, the first element of which is an iterator pointing to the + // inserted element or the already existing element, the second element of which is a boolean value set to 'true' + // if the element is newly inserted, or 'false' if the element existed. Note that existing elements remain + // unchanged by this operation. + std::pair insert(const value_type &value) { + if (value.first == 0) { + throw std::runtime_error("You are not permitted to use key 0!
Zero is a special value."); + } + auto entry = shash_insert(_table, value.first, static_cast(value.second.value())); + if (entry) { + return {iterator{_table, entry}, true}; + } + entry = shash_get(_table, value.first); + return {iterator{_table, entry}, false}; + } + // Insert a range of elements beginning with the specified 'first', and stopping at, though not including, the + // specified 'last'. Note that the iterators must point to objects conforming to the 'shash_entry' + // concept, having the two member functions 'key()' and 'value()' whose signatures match. + template + void insert(InputIt first, InputIt last) { + auto&& self = *this; + std::for_each(first, last, [&self](auto entry){self.create(entry.key(), entry.value());}); + } + // Return the mapped value for the specified 'key'. + // Throws: out_of_range if no such key exists in the table. + // runtime_error if this table does not exist. + mapped_type at(key_type key) { + auto found = shash_get(_table, key); + if (found == nullptr) { + throw std::out_of_range(stringer::str("Entry with key ", key, + " does not exist.", + " Has the process writing to this iqueue started yet?")); + } + return mapped_type{_table, found}; + } + const_iterator find(key_type key) const { return find_internal(key); } + iterator find(key_type key) { return find_internal(key); } + + shash_table(shash_table const &other) = default; + shash_table(shash_table&& other) noexcept = default; + shash_table() = delete; + shash_table &operator = (shash_table const &rhs) = default; + ~shash_table() {_table = nullptr;} +private: + template + IteratorType begin_internal() const { + size_type num_entries; + return IteratorType{_table, shash_entries(_table, &num_entries)}; + } + template + IteratorType end_internal() const { + size_type num_entries; + auto end = shash_entries(_table, &num_entries); + // Heartbeat keys are arranged contiguously.
A key of zero signifies an empty entry in the table, and hence + // the end of the populated area of the table. + for (size_type i = 0; i < num_entries; ++i) { + if (end->key == 0) { + break; + } + ++end; + } + return IteratorType{_table, end}; + } + template + IteratorType find_internal(key_type key) const { + auto found = shash_get(_table, key); + if (found == nullptr) { + return end_internal(); + } + return IteratorType{_table, found}; + } + friend struct iqueue; + explicit shash_table(shash_t *table) : _table{table} { } + + shash_t* _table; +}; + +// Provides a wrapper for 'high_resolution_clock::time_point' that adds conversion to/from uint64_t. +struct uint64_convertible_time_point : std::chrono::high_resolution_clock::time_point +{ + template + static uint64_t to_nanos(std::chrono::time_point timestamp) { + return static_cast( + std::chrono::time_point_cast(timestamp).time_since_epoch().count()); + } + using base_type = std::chrono::high_resolution_clock::time_point; + using base_type::time_point; + // We want to implicitly convert from the base type. + uint64_convertible_time_point(base_type t) : base_type(t) {} + explicit uint64_convertible_time_point(uint64_t value) : base_type(std::chrono::nanoseconds(value)) {} + explicit operator uint64_t () const { return to_nanos(*this); } +}; + +using shash_heartbeat_entry = shash_entry; +using shash_heartbeat_table = shash_table; + +// Provides a container of byte spans wrapper around an iqueue, with additional type safety. +struct iqueue +{ + using const_iterator = iqueue_const_iterator; + using value_type = gsl::span; + using size_type = uint64_t; + + // Open the existing iqueue with the specified 'filename' and the specified 'access', optionally checking that the + // existing header matches the specified 'user_header'.
+ iqueue(char const* filename, access_mode access, gsl::span user_header = {}) + : _iqueue(iqueue_open(filename, access == access_mode::write)) { + using namespace std::string_literals; + + if (_iqueue == nullptr) { + throw std::runtime_error("failed to open iqueue: \""s + filename + "\""); + } + if (!user_header.empty()) { + if (user_header != header()) { + throw std::runtime_error("iqueue header doesn't match expectation!"); + } + } + } + + // Create a new iqueue or open an existing one with the specified 'filename', using the specified 'creation' time, + // and the specified 'user_header'. + template + iqueue(char const* filename, std::chrono::time_point creation, gsl::span user_header) + : _iqueue(iqueue_create(filename, uint64_convertible_time_point::to_nanos(creation), + user_header.data(), user_header.size())) { + using namespace std::string_literals; + + if (_iqueue == nullptr) { + throw std::runtime_error("failed to create iqueue: \""s + filename + "\""); + } + } + + // accept already-open C iqueue + explicit iqueue(iqueue_t* iqueue_) + : _iqueue(iqueue_) + {} + + // release the C iqueue inside to the user + iqueue_t* release() { + auto ret = _iqueue; + _iqueue = nullptr; + return ret; + } + + iqueue(iqueue&& rhs) noexcept : _iqueue(rhs._iqueue) { rhs._iqueue = nullptr; } + + iqueue& operator=(iqueue&& rhs) noexcept { + if (_iqueue != nullptr) { + iqueue_close(_iqueue); + } + _iqueue = rhs._iqueue; + rhs._iqueue = nullptr; + return *this; + } + + // non-copyable + iqueue(iqueue const&) = delete; + iqueue& operator=(iqueue const& rhs) = delete; + + const_iterator begin() const { return const_iterator(_iqueue, iqueue_begin(_iqueue)); } + // Returns an iterator pointing to the current end of this iqueue. Note that the end of the iqueue may change + // without invalidating any outstanding iterators. 
+ const_iterator end() const { return const_iterator(_iqueue, iqueue_end(_iqueue)); } + const_iterator cbegin() const { return begin(); } + const_iterator cend() const { return end(); } + + size_type size() const { return iqueue_entries(_iqueue); } + + ~iqueue() { if (_iqueue) iqueue_close(_iqueue); } + + gsl::span header() const { + std::size_t size = 0; + auto data = iqueue_header(_iqueue, &size); + return {reinterpret_cast(data), static_cast(size)}; + } + + // Returns the file name of the iqueue. + std::string name() const { + return iqueue_name(_iqueue); + } + + void append(value_type msg) { + int const ret = iqueue_append(_iqueue, msg.data(), msg.size()); + if (ret != 0) { + throw std::runtime_error(stringer::str( + "failed to append message (of ", msg.size(), " bytes) ", + "to iqueue \"", iqueue_name(_iqueue), "\":", ret)); + } + } + + // Check if the specified 'position' is ready to be read. + bool is_ready(const_iterator position) const { + auto const status = iqueue_status(_iqueue, position._id); + if (status != IQUEUE_STATUS_HAS_DATA && status != IQUEUE_STATUS_NO_DATA) { + throw std::runtime_error(stringer::str( + "iqueue status for index ", position._id, " was error ", status)); + } + return status == IQUEUE_STATUS_HAS_DATA; + } + + // Wait until the specified 'position' in the iqueue is available to be read, or until the specified 'timeout_ns' + // has elapsed, returning 'true' if data is available to read or 'false' otherwise. + bool wait(const_iterator position, std::chrono::nanoseconds timeout_ns) const { + auto const status = iqueue_status_wait(_iqueue, position._id, static_cast(timeout_ns.count())); + if (status != IQUEUE_STATUS_HAS_DATA && status != IQUEUE_STATUS_NO_DATA) { + throw std::runtime_error(stringer::str( + "posdelta iqueue status for index ", position._id, + " was error ", status)); + } + return status == IQUEUE_STATUS_HAS_DATA; + } + + // Wait until the specified 'position' in the iqueue is available to be read. 
+ void wait(iqueue_const_iterator position) { + int const status = iqueue_status_wait(_iqueue, position._id, -1); + if (status != IQUEUE_STATUS_HAS_DATA && status != IQUEUE_STATUS_NO_DATA) { + throw std::runtime_error(stringer::str( + "posdelta iqueue status for index ", position._id, + " was error ", status)); + } + } + + // Get the heartbeat table, creating it if the specified 'mode' is 'access_mode::write', or else expect the + // table to exist and return an 'nullopt' if it does not. + std::optional heartbeat_table(access_mode mode) { + auto table = iqueue_writer_table(_iqueue, 0, mode == access_mode::write ? 1 : 0); + if (table == nullptr) { + return std::nullopt; + } + return std::make_optional(shash_heartbeat_table{table}); + } + + // Create the heartbeat entry in the heartbeat table with the specified + // 'heartbeat_key', throwing an exception if the key already exists. Creates + // the heartbeat table if it doesn't already exist. + shash_heartbeat_entry create_heartbeat_entry(shash_heartbeat_entry::key_type heartbeat_key) { + shash_t* table = iqueue_writer_table(_iqueue, 0, 1); + if (!table) { + throw std::runtime_error("Failed to create heartbeat table"); + } + return shash_heartbeat_table(table).create(heartbeat_key); + } + + // Get the heartbeat entry in the heartbeat table with the specified + // 'heartbeat_key', throwing an exception if the key or table do not exist. 
+ shash_heartbeat_entry get_heartbeat_entry(shash_heartbeat_entry::key_type heartbeat_key) { + shash_t* table = iqueue_writer_table(_iqueue, 0, 0); + if (table) { + return shash_heartbeat_table(table).at(heartbeat_key); + } else { + throw std::runtime_error("Heartbeat table does not exist, has the writing process started?"); + } + } + + explicit operator iqueue_t*() const { return _iqueue; } + +private: + iqueue_t* _iqueue; +}; + +}}} + diff --git a/include/shash.h b/include/shash.h index 6fd598a..f4720d6 100644 --- a/include/shash.h +++ b/include/shash.h @@ -1,3 +1,18 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ /** \file * Shared simple hash table. * diff --git a/include/stringer.hh b/include/stringer.hh new file mode 100644 index 0000000..dbd6508 --- /dev/null +++ b/include/stringer.hh @@ -0,0 +1,31 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include + +namespace stringer { + +template +inline std::string str(Args const&... args) +{ + std::ostringstream stream; + int unpack[]{0, ((stream << args), 0 )...}; + // can't use __USE, unpack is an array, and __USE does &, which isn't valid on arrays + static_cast(unpack); + return stream.str(); +} + +} diff --git a/iqueue.pc.in b/iqueue.pc.in new file mode 100644 index 0000000..6d9aefa --- /dev/null +++ b/iqueue.pc.in @@ -0,0 +1,10 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: iqueue +Description: The complete iqueue library +Version: 0.1.0 +Cflags: -I${includedir} +Libs: -L${libdir} -liqueue -lpthread diff --git a/src/container_of.h b/src/container_of.h new file mode 100644 index 0000000..9ffae39 --- /dev/null +++ b/src/container_of.h @@ -0,0 +1,37 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef _CONTAINER_OF_H_ +#define _CONTAINER_OF_H_ + +#include "twosigma.h" + +__BEGIN_DECLS + +/** + * container_of - cast a member of a structure out to the containing structure + * param #1: the pointer to the member. + * param #2: the type of the container struct this is embedded in. + * param #3: the name of the member within the struct. 
+ */ +#define container_of(ptr, type, member) ({ \ + /* check the type of `member' is correct given `ptr' */ \ + const __typeof__ (__REINTERPRET_CAST(type*, 0)->member) *__mptr = (ptr); \ + /* do the actual offset-and-cast */ \ + __REINTERPRET_CAST(type*, (__REINTERPRET_CAST(uintptr_t, __mptr) - offsetof(type,member)));}) + +__END_DECLS + +#endif diff --git a/src/dump_iqueue.cc b/src/dump_iqueue.cc new file mode 100644 index 0000000..8bde14e --- /dev/null +++ b/src/dump_iqueue.cc @@ -0,0 +1,64 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include "try_unix.hh" +#include "iqueue.hh" + +struct options { + char *iqueue_path; + int outfd; +}; + +struct options get_options(int argc, char **argv) +try { + if (argc != 2) throw std::runtime_error("Incorrect number of arguments"); + int outfd = 1; + return { + .iqueue_path = argv[1], + .outfd = outfd, + }; +} catch (std::exception const& e) { + warnx("Usage: %s ", + (argc > 0 ? 
argv[0] : "dump_iqueue")); + warnx("%s", e.what()); + exit(1); +} + +int main(int argc, char **argv) { + struct options opt = get_options(argc, argv); + ts::mmia::cpputils::iqueue iq(opt.iqueue_path, ts::mmia::cpputils::access_mode::read_only); + auto it = iq.begin(); + for (;;) { + if (!iq.wait(it, std::chrono::nanoseconds(0))) { + exit(0); + } + gsl::span msg = *it; + if (msg.size() > 4096) { + warnx("Message is unrealistically huge. Truncating to 4096."); + msg = { msg.data(), 4096 }; + } + ssize_t written = try_(write(opt.outfd, reinterpret_cast(msg.data()), msg.size())); + if (written != msg.size()) { + warnx("Partial write? Did I get called with a non-packetized pipe?"); + exit(1); + } + ++it; + } +} diff --git a/src/getlogstr.c b/src/getlogstr.c new file mode 100644 index 0000000..3081609 --- /dev/null +++ b/src/getlogstr.c @@ -0,0 +1,208 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "twosigma.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef _REENTRANT
+#include
+#endif
+#include
+#include
+#include
+#include
+
+#include
+
+#include "tslog.h"
+#include "tstl.h"
+
+#define BUF_SIZE 2048
+static tstl_t threadlocal_buf = TSTL_BUF_INITIALIZER(BUF_SIZE);
+
+static const char *oldlogstr = PREF_LONG;
+static const char *newlogstr = DEFAULT_TSLOG_FORMAT;
+
+/*
+ * Install `fmt' as the format used by both getlogstr() and getnewlogstr().
+ *
+ * Recognized escapes:
+ * %%: %
+ * %D: Datetime+Zone
+ * %F: File
+ * %L: Line
+ * %N: Program name
+ * %P: File with Path
+ * %f: Function
+ * %T: Type
+ * %t: Thread id (expanded by getlogstr0() on Linux, empty elsewhere)
+ *
+ * The pointer is stored, not copied, so `fmt' must outlive all later
+ * logging calls.
+ *
+ * Returns 0 on success.  Returns -1 (after a warnx() diagnostic, leaving
+ * the current formats untouched) if `fmt' contains an unknown escape or
+ * ends with a lone `%'.
+ */
+int
+setlogstr(const char *fmt)
+{
+    const char *p;
+
+    for (p = fmt; *p; p++) {
+        if (*p != '%')
+            continue;
+        /* step onto the escape character that follows the `%' */
+        p++;
+        if (*p == '\0') {
+            /*
+             * Must be tested before strchr(): strchr() treats the
+             * terminating NUL as part of the searched string, so it
+             * would report a match and the loop would then advance
+             * past the end of `fmt'.
+             */
+            warnx("Missing character after `%%'");
+            return -1;
+        }
+        if (strchr("%DFLNPfTt", *p) == NULL) {
+            warnx("Unknown escape `%c'", *p);
+            return -1;
+        }
+    }
+
+    oldlogstr = newlogstr = fmt;
+    return 0;
+}
+
+static const char *
+getlogstr0(const char *file, int line, const char *func, int type, const char *logstr)
+{
+    char datetime[64];
+    char scratch[64];
+    struct timeval tv;
+    struct tm tmb;
+    struct tm *tmp;
+    time_t t;
+    int oerrno = errno;
+    const char *p, *ptr;
+    char *buffer = tstl_get(&threadlocal_buf);
+    char *b = buffer, *eb = buffer + BUF_SIZE;
+
+    assert(buffer != NULL);
+
+    (void)gettimeofday(&tv, NULL);
+    t = (time_t)tv.tv_sec;
+    tmp = localtime_r(&t, &tmb);
+    if (tmp == NULL)
+        (void)snprintf(scratch, sizeof(scratch), "XXXX-XX-XX XX:XX:XX.XXX XXX");
+    else
+        (void)strftime(scratch, sizeof(scratch),
+            "%Y-%m-%d %H:%M:%S.%%.3d %Z", tmp);
+    /* `scratch' now holds a "%.3d" conversion, which takes an int; tv_usec is wider on LP64 */
+    (void)snprintf(datetime, sizeof(datetime), scratch, (int)(tv.tv_usec / 1000));
+
+#define ADDC(c) \
+    do { \
+        if (b == eb) { \
+            b--; \
+            goto out; \
+        } \
+        *b++ = c; \
+    } while (/*CONSTCOND*/0)
+
+#define ADDS(s) \
+    do { \
+        const char *_s = s; \
+        while (*_s) { \
+            if (b == eb) { \
+                b--; \
+                goto out; \
+            } \
+            *b++ = *_s++; \
+        } \
+    } while (/*CONSTCOND*/0)
+
+    for (p = logstr; *p; p++) {
+        if (*p != '%') {
+            ADDC(*p);
+            continue;
+        }
+        switch (*++p) {
+        case '%':
+            ADDC(*p);
+ break; + case 'D': + ADDS(datetime); + break; + case 'F': + if ((ptr = strrchr(file, '/')) != NULL) + ptr++; + else + ptr = file; + ADDS(ptr); + break; + case 'L': + (void)snprintf(scratch, sizeof(scratch), "%d", line); + ADDS(scratch); + break; + case 'N': + ADDS(getprogname()); + break; + case 'P': + ADDS(file); + break; + case 'f': + ADDS(func); + break; + case 'T': + switch (type) { + case -1: + if (isspace((unsigned char)p[1])) + p++; + break; + case TSFATAL: + ADDS("ABORT"); + break; + case TSERROR: + ADDS("ERROR"); + break; + case TSWARN: + ADDS("WARN"); + break; + case TSINFO: + ADDS("INFO"); + break; + case TSDEBUG: + ADDS("DEBUG"); + break; + case TSDIAG: + ADDS("DIAG"); + break; + default: + warnx("unknown error type %d", type); + break; + } + break; + case 't': +#ifdef __linux__ + /* leave a noop case even if not __linux__ to avoid an abort on %t */ + (void)snprintf(scratch, sizeof(scratch), "%ld", (long) syscall(__NR_gettid)); + ADDS(scratch); +#endif + break; + default: + warnx("unknown log formatting character `%c'", *p); + break; + } + } +out: + ADDC('\0'); + errno = oerrno; + return buffer; +} + +const char * +getlogstr(const char *file, int line, const char *func, int type) +{ + return getlogstr0(file, line, func, type, oldlogstr); +} + +const char * +getnewlogstr(const char *file, int line, const char *func, int type) +{ + return getlogstr0(file, line, func, type, newlogstr); +} diff --git a/src/in2iqueue.cc b/src/in2iqueue.cc new file mode 100644 index 0000000..fd84dc2 --- /dev/null +++ b/src/in2iqueue.cc @@ -0,0 +1,88 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include "try_unix.hh" +#include "iqueue.hh" +#include +#include +#include +#include +using std::experimental::optional; +using std::experimental::nullopt; + +struct options { + char *iqueue_path; + int infd; + optional shash_key; +}; + +struct options get_options(int argc, char **argv) +try { + if ((argc != 2) && (argc != 3)) throw std::runtime_error("Incorrect number of arguments"); + int infd = 0; + try_(fcntl(infd, F_SETFL, try_(fcntl(infd, F_GETFL, 0)) & ~O_NONBLOCK)); + struct options opt = { + .iqueue_path = argv[1], + .infd = infd, + .shash_key = nullopt, + }; + if (argc == 3) { + opt.shash_key = std::stoul(argv[2], nullptr, 0); + } + return opt; +} catch (std::exception const& e) { + warnx("Usage: %s [shash_key]", + (argc > 0 ? 
argv[0] : "in2iqueue")); + warnx("%s", e.what()); + exit(1); +} + +gsl::span read_fd(int fd) { + static gsl::byte buf[4096]; + int ret = try_(read(fd, buf, sizeof(buf))); + if (ret == 0) exit(0); + return { buf, static_cast(ret) }; +} + +void fd2iqueue(int fd, ts::mmia::cpputils::iqueue& iq) { + for (;;) { + auto bytes = read_fd(fd); + iq.append(bytes); + } +} + +void update_heartbeat_loop(ts::mmia::cpputils::shash_heartbeat_entry entry) { + for (;;) { + entry.update(std::chrono::high_resolution_clock::now()); + std::this_thread::sleep_for(std::chrono::nanoseconds(100)); + } +} + +int main(int argc, char **argv) { + struct options opt = get_options(argc, argv); + auto iq = ts::mmia::cpputils::iqueue(opt.iqueue_path, ts::mmia::cpputils::access_mode::write); + optional heartbeat_thread = nullopt; + if (opt.shash_key) { + auto entry = iq.create_heartbeat_entry(*opt.shash_key); + entry.update(std::chrono::high_resolution_clock::now()); + heartbeat_thread = std::thread(update_heartbeat_loop, entry); + } + fd2iqueue(opt.infd, iq); +} diff --git a/src/io_utils.c b/src/io_utils.c new file mode 100644 index 0000000..dc9fb26 --- /dev/null +++ b/src/io_utils.c @@ -0,0 +1,107 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include "io_utils.h"
+#include
+#include
+#include
+#include
+
+/*
+ * Write `len' bytes from `buf_ptr' to `fd', calling write() again after
+ * each short write.
+ *
+ * Returns the total number of bytes written.  If write() returns 0 the
+ * loop stops and the bytes written so far are returned, so the result can
+ * be less than `len'.  Returns -1 as soon as write() reports an error; the
+ * count of bytes already written is not reported in that case.
+ *
+ * NOTE(review): an interrupted write() (EINTR) is treated like any other
+ * error -- confirm that callers expect this.
+ */
+ssize_t
+write_all(
+    int fd,
+    const void * const buf_ptr,
+    size_t len
+)
+{
+    const uint8_t * buf = buf_ptr;
+    size_t offset = 0;
+
+    while (1)
+    {
+        ssize_t wlen = write(fd, buf, len);
+        // Done: either everything that was left got written, or nothing could be
+        if (wlen == (ssize_t) len || wlen == 0)
+            return offset + wlen;
+
+        if (wlen < 0) return -1;
+
+        // Short write: skip what was accepted and retry with the rest
+        offset += wlen;
+        buf += wlen;
+        len -= wlen;
+    }
+}
+
+/*
+ * Advance an iovec array past its first `adjust_len' bytes.
+ *
+ * Entries that are entirely consumed are skipped; the first entry that is
+ * only partly consumed has its iov_base/iov_len trimmed in place.  On
+ * success *iov_ptr and *entries_ptr describe the remaining data and 0 is
+ * returned.
+ *
+ * Returns -1 when `adjust_len' is greater than or equal to the number of
+ * bytes held by all entries; *iov_ptr and *entries_ptr are left unchanged
+ * in that case.
+ *
+ * The first entry is read before the entry count is checked, so
+ * *entries_ptr must be at least 1 on entry.
+ */
+int
+iovadjust(
+    size_t adjust_len,
+    struct iovec ** iov_ptr,
+    int *entries_ptr
+)
+{
+    int entries = *entries_ptr;
+    struct iovec * iov = *iov_ptr;
+
+    while (1)
+    {
+        // If the adjustment is less than the entry, we're done
+        if (iov->iov_len > adjust_len)
+        {
+            iov->iov_len -= adjust_len;
+            iov->iov_base = adjust_len + (uint8_t *) iov->iov_base;
+            break;
+        }
+
+        // This entry is entirely consumed; move to the next one
+        adjust_len -= iov->iov_len;
+        iov++;
+
+        if (--entries > 0)
+            continue;
+
+        // Ran out of entries before finding one with data left over
+        return -1;
+    }
+
+    // Write the new entry count and first iov pointer
+    *entries_ptr = entries;
+    *iov_ptr = iov;
+    return 0;
+}
+
+/*
+ * Write `total_len' bytes described by the `entries' elements of `iov' to
+ * `fd', calling writev() again after each short write.
+ *
+ * Returns the total number of bytes written; the result can be less than
+ * `total_len' if writev() returns 0.  Returns -1 if writev() reports an
+ * error, or if iovadjust() cannot advance past a short write (the vector
+ * holds no data beyond what was just written).
+ *
+ * The elements of `iov' may be modified in place after a short write.
+ */
+ssize_t
+writev_all(
+    int fd,
+    size_t total_len,
+    struct iovec * iov,
+    int entries
+)
+{
+    size_t offset = 0;
+
+    while (1)
+    {
+        ssize_t wlen = writev(fd, iov, entries);
+        // Done: either everything that was left got written, or nothing could be
+        if (wlen == (ssize_t) total_len || wlen == 0)
+            return offset + wlen;
+
+        if (wlen < 0) return -1;
+
+        // Adjust the iov to find where to start again
+        total_len -= wlen;
+        offset += wlen;
+
+        if (iovadjust(wlen, &iov, &entries) < 0)
+            return -1;
+    }
+}
diff --git a/src/io_utils.h b/src/io_utils.h
new file mode 100644
index 0000000..a84beb2
--- /dev/null
+++ b/src/io_utils.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2021 Two Sigma Open Source, LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include + +/** + * Write len bytes to the fd, looping until all of them are written. + * + * \return -1 on error or the number of bytes actually written. + * If the file descriptor is closed during the write the number of + * byte will be less than len, otherwise the return should be + * the same as len. + */ +ssize_t +write_all( + int fd, + const void * buf_ptr, + size_t len +); + + +/** Advance an iov by adjust_len bytes. + * \return 0 on success, -1 on failure (insufficient data) + */ +int +iovadjust( + size_t adjust_len, + struct iovec ** iov_ptr, + int *entries_ptr +); + + +/** + * Write total_len bytes from the io vector, looping until all of + * them are written. + * \note iov might be modified if there are partial writes. + * \return -1 on error or the number of bytes actually written. + * If the file descriptor is closed during the writev the number of + * byte will be less than len, otherwise the return should be + * the same as len. + */ +ssize_t +writev_all( + int fd, + size_t total_len, + struct iovec * iov, + int entries +); diff --git a/src/iqmod_common.h b/src/iqmod_common.h new file mode 100644 index 0000000..588b247 --- /dev/null +++ b/src/iqmod_common.h @@ -0,0 +1,67 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef _iqmod_common_h_ +#define _iqmod_common_h_ + +#include "twosigma.h" +#include "tsassert.h" + +#include +#include + +typedef struct { + uint8_t table; + uint64_t key; +} shash_coord_t; + +static uint64_t * +parse_tuple(size_t min_tokens, size_t max_tokens, const char *str, size_t *found_tokens, uint64_t default_value) +{ + uint64_t *list = malloc(max_tokens * sizeof(*list)); + tsassert(list); + char *buff = strdup(str); + tsassert(buff); + + *found_tokens = 0; + char *token; + char *saveptr = NULL; + while ((token = strtok_r(buff, ":", &saveptr))) { + if (*found_tokens >= max_tokens) { + errx(1, "Expected at most `%zu` items in `%s`, found `%zu`", max_tokens, str, *found_tokens+1); + } + buff = NULL; + errno = 0; + char *endptr; + list[(*found_tokens)++] = strtoul(token, &endptr, 0); + if (errno) { + err(1, "Strtol failed parsing argument `%s`", str); + } + if (*endptr != '\0') { + errx(1, "Strtol found text at the end of a number in `%s`", str); + } + } + + if (*found_tokens < min_tokens) { + errx(1, "Expected at least `%zu` items in `%s`, found `%zu`", min_tokens, str, *found_tokens); + } + + for (size_t i = *found_tokens; i < max_tokens; ++i) { + list[i] = default_value; + } + + return list; +} +#endif diff --git a/src/iqmod_copy-main.c b/src/iqmod_copy-main.c new file mode 100644 index 0000000..ac6c024 --- /dev/null +++ b/src/iqmod_copy-main.c @@ -0,0 +1,470 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance 
with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "twosigma.h" +#include "tsassert.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "iqueue.h" +#include "iqmod_common.h" +#include "tsclock.h" + +// Absolute minimum number of files that can be open at once. +#define MAX_UNWARNED_FILES _POSIX_OPEN_MAX + +#define TSFLEXHASH_NAME shash_mod_table +#define TSFLEXHASH_KEY_TYPE shash_coord_t +#define TSFLEXHASH_VAL_TYPE uint64_t +#define TSFLEXHASH_KEY_HASH(key) (key.table ^ key.key) +#define TSFLEXHASH_KEY_EQUALS(k1, k2) (k1.table == k2.table && k1.key == k2.key) +#include "tsflexhash.h" + +#define TSFLEXHASH_NAME msg_mod_table +#define TSFLEXHASH_KEY_TYPE uint64_t +#define TSFLEXHASH_VAL_TYPE uint64_t +#include "tsflexhash.h" + +static void +print_help(char **argv) +{ + fprintf(stderr, +"\n" +"iqmod_copy - create a modified copy of an iqueue.\n" +"usage: %s [OPTIONS] input.iqx output.iqx\n" +"\n" +"DESCRIPTION:\n" +"iqmode creates a copy of a given iqueue with added, removed, or edited messages, header, or shash entries.\n" +"Messages to be edited or inserted will be dumped to a binary file, which can be edited as necessary, and\n" +"will be automatically loaded back into the new iqueue. 
The binary files will be stored under `.mod/`\n" +"\n" +"OPTIONS: (repeatable - numbers in oct, dec, or hex)\n" +"\t-h This help.\n" +"\t-t Edit the header.\n" +"\t-i [:] Insert new messages between and (files -1.insert to -.insert).\n" +"\t-d [:] Drop messages to .\n" +"\t-e [:] Edit messages to . (files .edit to .edit)\n" +"\t-s :: Override the original shash, adding if none. The value can be smaller than the input one.\n" +"\t-x
: Drop the original shash, if any.\n" +"\n" +"EXAMPLES:\n" +"Remove messages 10-100 inclusive, and edit header:\n" +"\t iqmod_copy -t -d 10:101\n" +"Insert 2 messages at position 10 and 11, so the output will have 0-9 + new1 + new2 + 10-last:\n" +"\t iqmod_copy -i 10:2\n" +"Edit message 307 and drop shash in table 0 and slot 123:\n" +"\t iqmod_copy -e 307 -x 0:123\n" +"\n" + , basename(argv[0])); +} + +static void +mod_shashes( + int8_t table, + iqueue_t *input, iqueue_t *output, + shash_mod_table_t *set_map, + shash_mod_table_t *drop_map) +{ + shash_t *input_table = iqueue_writer_table(input, table, 0); + shash_t *output_table = NULL; + + // Add all the user-specified keys. + shash_mod_table_iterator_t it; + shash_mod_table_iterator_init(set_map, &it); + for (;;) { + shash_coord_t key; + uint64_t *val = shash_mod_table_iterator_next(set_map, &it, &key); + if (!val) { + break; + } + if (key.table != table) { + // Skip entries for other tables. + continue; + } + if (!output_table) { + // Now we know we need the table, create it if it's missing. + output_table = iqueue_writer_table(output, table, 1); + } + shash_insert(output_table, key.key, *val); + } + + if (!input_table) { + return; + } + + // Copy all the existing entries, possibly modifying or dropping some. + unsigned shash_len; + shash_entry_t *entries = shash_entries(input_table, &shash_len); + for (unsigned i = 0; i < shash_len; ++i) { + shash_coord_t coord = (shash_coord_t){table, entries[i].key}; + uint64_t *op_set = shash_mod_table_get(set_map, coord); + uint64_t *op_drop = shash_mod_table_get(drop_map, coord); + if (op_set && op_drop) { + errx(1, "Cannot both set and drop shash %d:%"PRIu64, table, entries[i].key); + } + + if (op_drop || op_set) { + continue; + } + if (!entries[i].key) { + // Skip key == 0, which is used internally by iqueue. + continue; + } + if (!output_table) { + // Now we know we need the table, create it if it's missing. 
+            output_table = iqueue_writer_table(output, table, 1);
+        }
+
+        // Try to insert. If we set the value with the user-provided data, insert won't change it.
+        shash_insert(output_table, entries[i].key, entries[i].value);
+    }
+}
+
+// Build the path `<edit_path>/<idx>-<cnt>.insert` for the cnt-th message inserted at idx.
+// The string is allocated by asprintf() and is not freed by any caller visible here.
+// NOTE(review): the asprintf() call sits inside tsassert(); confirm tsassert() always
+// evaluates its argument, otherwise file_path is returned uninitialized.
+static const char *
+get_insert_buffer_path(
+    const char *edit_path,
+    uint64_t idx,
+    uint64_t cnt)
+{
+    char *file_path;
+    tsassert(asprintf(&file_path, "%s/%"PRIu64"-%"PRIu64".insert", edit_path, idx, cnt) >= 0);
+    return file_path;
+}
+
+// Build the path `<edit_path>/<idx>.edit` for the editable copy of message idx.
+// Same allocation and tsassert() caveats as get_insert_buffer_path().
+static const char *
+get_edit_buffer_path(
+    const char *edit_path,
+    uint64_t idx)
+{
+    char *file_path;
+    tsassert(asprintf(&file_path, "%s/%"PRIu64".edit", edit_path, idx) >= 0);
+    return file_path;
+}
+
+// Build the path `<edit_path>/header` for the editable copy of the iqueue header.
+// Same allocation and tsassert() caveats as get_insert_buffer_path().
+static const char *
+get_header_buffer_path(
+    const char *edit_path)
+{
+    char *file_path;
+    tsassert(asprintf(&file_path, "%s/header", edit_path) >= 0);
+    return file_path;
+}
+
+// Write a message (from iqueue, or new) to an editable buffer file.
+// Creates (or truncates) `buffer_path` and stores exactly `msg_len` bytes of `msg` in it;
+// `msg` may be NULL when `msg_len` is 0. Any failure terminates the process.
+static void
+write_editable_buffer(
+    const char *buffer_path,
+    const void *msg,
+    size_t msg_len)
+{
+    int file;
+    if ((file = creat(buffer_path, S_IRWXU)) == -1) {
+        err(1, "Could not create file `%s`", buffer_path);
+    }
+    // write() may accept fewer bytes than requested; keep going until the whole
+    // message is stored so the user never edits a silently truncated copy.
+    const char *cursor = msg;
+    size_t remaining = msg_len;
+    while (remaining > 0) {
+        ssize_t written = write(file, cursor, remaining);
+        if (written == -1) {
+            err(1, "Could not write to file `%s`", buffer_path);
+        }
+        if (written == 0) {
+            // No progress and no errno to report; bail out instead of spinning.
+            errx(1, "Could not write to file `%s`", buffer_path);
+        }
+        cursor += written;
+        remaining -= (size_t)written;
+    }
+    if (close(file) == -1) {
+        err(1, "Could not close file `%s`", buffer_path);
+    }
+}
+
+// Load the content of an editable buffer file.
+// Maps `buffer_path` read-only and returns the mapping, storing the file size in
+// `*msg_len`. The mapping is never unmapped by this function. Any failure, including
+// an empty file (which cannot be mapped), terminates the process.
+static const void *
+read_editable_buffer(
+    const char *buffer_path,
+    size_t *msg_len)
+{
+    int file;
+    if ((file = open(buffer_path, O_RDONLY)) == -1) {
+        err(1, "Could not open file `%s`", buffer_path);
+    }
+    struct stat file_stats;
+    if (fstat(file, &file_stats) == -1) {
+        err(1, "Could not get size of file `%s`", buffer_path);
+    }
+    *msg_len = file_stats.st_size;
+    if (*msg_len == 0) {
+        // mmap() rejects a zero length with EINVAL; say what is actually wrong.
+        errx(1, "File `%s` is empty, cannot map it", buffer_path);
+    }
+    const void *msg = mmap(0, *msg_len, PROT_READ, MAP_PRIVATE, file, 0);
+    if (msg == MAP_FAILED) {
+        err(1, "Failed mapping file `%s`", buffer_path);
+    }
+    // The mapping stays valid after the descriptor is closed; closing here keeps
+    // one descriptor from being leaked per inserted or edited message.
+    (void)close(file);
+    return msg;
+}
+
+static iqueue_t *
+mod_content(
+    const char *edit_path,
+    iqueue_t *input,
+    const char *output_path,
+    bool edit_header,
+    msg_mod_table_t *insert_map,
+    msg_mod_table_t *drop_map,
+    msg_mod_table_t *edit_map)
+{
+    uint64_t input_len = iqueue_entries(input);
+    bool wait_for_edits = false;
+    size_t header_len;
+    const char *header = iqueue_header(input, &header_len);
+
+    // Create the editable buffer file for the header, if it needs to be changed.
+    if (edit_header) {
+        wait_for_edits = true;
+        write_editable_buffer(get_header_buffer_path(edit_path), header, header_len);
+    }
+
+    // Create all the editable buffer files for insert and edit.
+ { + msg_mod_table_iterator_t it; + msg_mod_table_iterator_init(insert_map, &it); + for (;;) { + uint64_t idx; + uint64_t *add_count = msg_mod_table_iterator_next(insert_map, &it, &idx); + if (!add_count) { + break; + } + if (idx > input_len) { + errx(1, "Cannot insert at %"PRIu64", input lenght is %"PRIu64, idx, input_len); + } + wait_for_edits = true; + for (size_t i = 1; i <= *add_count; ++i) { + write_editable_buffer(get_insert_buffer_path(edit_path, idx, i), NULL, 0); + } + } + } + { + msg_mod_table_iterator_t it; + msg_mod_table_iterator_init(edit_map, &it); + for (;;) { + uint64_t idx; + uint64_t *toedit = msg_mod_table_iterator_next(edit_map, &it, &idx); + if (!toedit) { + break; + } + if (idx >= input_len) { + errx(1, "Cannot edit %"PRIu64", input lenght is %"PRIu64, idx, input_len); + } + wait_for_edits = true; + size_t msg_len; + const void *msg = iqueue_data(input, idx, &msg_len); + if (!msg) { + errx(1, "Could not read iqueue message %"PRIu64, idx); + } + write_editable_buffer(get_edit_buffer_path(edit_path, idx), msg, msg_len); + madvise(__UNCONST_T(void *, msg), msg_len, MADV_DONTNEED); + } + } + + // Wait for the user to confirm that she is done editing. + if (wait_for_edits) { + printf("Data ready to be edited under `%s`. Press ENTER when done editing.", edit_path); + int char_read; + while ((char_read = getchar()) != -1 && char_read != '\n'); + if (char_read == -1) { + errx(1, "Failed reading from stdin"); + } + } + + // Create the new iqueue. 
+ if (edit_header) { + const char *header_buffer_file = get_header_buffer_path(edit_path); + header = read_editable_buffer(header_buffer_file, &header_len); + unlink(header_buffer_file); + } + + uint64_t creation_time = tsclock_getnanos(0); + iqueue_t *output = iqueue_create(output_path, creation_time, header, header_len); + if (!output) { + err(1, "Could not create output iqx `%s`\n", output_path); + } + if (iqueue_creation(output) != creation_time) { + errx(1, "Output iqx `%s` already exist, please give a new path.", output_path); + } + + // Copy all the messages from the input iqueue with the necessary changes. + for (uint64_t idx = 0; idx <= input_len; ++idx) { + uint64_t *op_insert = msg_mod_table_get(insert_map, idx); + uint64_t *op_edit = msg_mod_table_get(edit_map, idx); + uint64_t *op_drop = msg_mod_table_get(drop_map, idx); + if (op_edit && op_drop) { + errx(1, "Cannot both edit and drop message %"PRIu64, idx); + } + + // Pick up the messages that will be inserted at this point. + if (op_insert) { + for (uint64_t i = 1; i <= *op_insert; ++i) { + const char *buffer_file = get_insert_buffer_path(edit_path, idx, i); + size_t msg_len; + const void *msg = read_editable_buffer(buffer_file, &msg_len); + tsassert(!iqueue_append(output, msg, msg_len)); + madvise(__UNCONST_T(void *, msg), msg_len, MADV_DONTNEED); + unlink(buffer_file); + } + } + + if (idx >= input_len) { + break; + } + + // Skip copying the current message if dropping. + if (op_drop) { + continue; + } + + // Pick up the edited message to use for this position. + if (op_edit) { + const char *buffer_file = get_edit_buffer_path(edit_path, idx); + size_t msg_len; + const void *msg = read_editable_buffer(buffer_file, &msg_len); + tsassert(!iqueue_append(output, msg, msg_len)); + madvise(__UNCONST_T(void *, msg), msg_len, MADV_DONTNEED); + unlink(buffer_file); + continue; + } + + // None of the other options apply, just copy verbatim from the input. 
+ size_t msg_len; + const char *msg = iqueue_data(input, idx, &msg_len); + tsassert(!iqueue_append(output, msg, msg_len)); + madvise(__UNCONST_T(void *, msg), msg_len, MADV_DONTNEED); + } + + return output; +} + +int +main(int argc, char **argv) +{ + bool edit_header = false; + + size_t found_tokens; + uint64_t *token_list; + + shash_mod_table_t *shash_set_map = shash_mod_table_create(32); + shash_mod_table_t *shash_drop_map = shash_mod_table_create(32); + + msg_mod_table_t *msg_insert_map = msg_mod_table_create(32); + msg_mod_table_t *msg_edit_map = msg_mod_table_create(32); + msg_mod_table_t *msg_drop_map = msg_mod_table_create(32); + + shash_coord_t coord; + uint64_t *table_value; + int option = -1; + while ((option = getopt(argc, argv, "hi:d:e:ts:x:")) != -1) { + switch (option) { + case 'h': + print_help(argv); + return EXIT_FAILURE; + case 'i': + token_list = parse_tuple(1, 2, optarg, &found_tokens, 1); + table_value = msg_mod_table_insert(msg_insert_map, token_list[0]); + *table_value = token_list[1]; + break; + case 'e': + token_list = parse_tuple(1, 2, optarg, &found_tokens, 1); + for (size_t i = 0; i < token_list[1]; ++i) { + msg_mod_table_insert(msg_edit_map, token_list[0]+i); + } + break; + case 'd': + token_list = parse_tuple(1, 2, optarg, &found_tokens, 1); + for (size_t i = 0; i < token_list[1]; ++i) { + msg_mod_table_insert(msg_drop_map, token_list[0]+i); + } + break; + case 't': + edit_header = true; + break; + case 's': + token_list = parse_tuple(3, 3, optarg, &found_tokens, 0); + coord = (shash_coord_t){token_list[0], token_list[1]}; + table_value = shash_mod_table_insert(shash_set_map, coord); + *table_value = token_list[2]; + break; + case 'x': + token_list = parse_tuple(2, 2, optarg, &found_tokens, 0); + coord = (shash_coord_t){token_list[0], token_list[1]}; + shash_mod_table_insert(shash_drop_map, coord); + break; + default: + fprintf(stderr, "Uknown option `%c` (%d)\n", option, option); + print_help(argv); + return EXIT_FAILURE; + } + } + + 
// Warn if the user is trying to edit too many messages at once. + if (msg_mod_table_size(msg_insert_map) + msg_mod_table_size(msg_edit_map) > MAX_UNWARNED_FILES) { + printf("Trying to insert and edit `%"PRIu32"` messages. Ctrl-C to stop here, ENTER to continue\n", + msg_mod_table_size(msg_insert_map) + msg_mod_table_size(msg_edit_map)); + int char_read; + while ((char_read = getchar()) != -1 && char_read != '\n'); + if (char_read == -1) { + errx(1, "Failed reading from stdin"); + } + } + + if (optind == 1) { + fprintf(stderr, "Expecting at least 1 editing option, but found none\n"); + print_help(argv); + return EXIT_FAILURE; + } + + if (argc - optind != 2) { + fprintf(stderr, "Expecting exactly 2 non-option arguments, found `%d`\n", argc - optind); + print_help(argv); + return EXIT_FAILURE; + } + + const char *input_path = argv[optind]; + const char *output_path = argv[optind+1]; + + char *edit_path; + if (asprintf(&edit_path, "%s.mod", input_path) == -1) { + errx(1, "Could not create path for edit folder `%s`", edit_path); + } + + iqueue_t *input = iqueue_open(input_path, false); + if (!input) { + err(1, "Could not open input iqx `%s`", input_path); + } + + if (mkdir(edit_path, S_IRWXU)) { + errx(1, "Could not create edit folder `%s`", edit_path); + } + printf("Creating edit folder at `%s`. Will erase on success.\n", edit_path); + + // Copy the modified messages first, as creation requires the new header. + iqueue_t *output = mod_content(edit_path, input, output_path, edit_header, msg_insert_map, msg_drop_map, msg_edit_map); + + // Copy the modified shashes (iqueue uses tables 0-3). 
+ for (size_t i = 0; i <= 3; ++i) { + mod_shashes(i, input, output, shash_set_map, shash_drop_map); + } + + rmdir(edit_path); + return EXIT_SUCCESS; +} diff --git a/src/iqmod_inplace-main.c b/src/iqmod_inplace-main.c new file mode 100644 index 0000000..2afaaa4 --- /dev/null +++ b/src/iqmod_inplace-main.c @@ -0,0 +1,148 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "twosigma.h" +#include "tsassert.h" + +#include +#include +#include +#include +#include +#include "iqueue.h" +#include "iqmod_common.h" + +#define TSFLEXHASH_NAME shash_mod_table +#define TSFLEXHASH_KEY_TYPE shash_coord_t +#define TSFLEXHASH_VAL_TYPE uint64_t +#define TSFLEXHASH_KEY_HASH(key) (key.table ^ key.key) +#define TSFLEXHASH_KEY_EQUALS(k1, k2) (k1.table == k2.table && k1.key == k2.key) +#include "tsflexhash.h" + +static void +print_help(char **argv) +{ + fprintf(stderr, +"\n" +"iqmod_inplace - modify an iqueue in place.\n" +"usage: %s [OPTIONS] input.iqx\n" +"\n" +"DESCRIPTION:\n" +"iqmod_inplace modifies a given iqueue's creation_time or shash entries.\n" +"\n" +"OPTIONS: (repeatable - numbers in oct, dec, or hex)\n" +"\t-h This help.\n" +"\t-c Creation time in decimal nanos.\n" +"\t-s
:: Override the original shash, adding if none.\n" +"\n" +"EXAMPLES:\n" +"Modify creation time:\n" +"\t iqmod_inplace -c 1500303647133731422 test.iqx\n" +"Modify shash in table 0(heartbeat table) and slot 123:\n" +"\t iqmod_inplace -s 0:123:1500305470665041605\n" +"Modify shash in table 1(iqsync source table) and slot 1500305470665041605:\n" +"\t iqmod_inplace -s 1:1500305470665041605:98 test.iqx\n" +"\n" + , basename(argv[0])); +} + +static void +mod_shashes( + iqueue_t *input, + shash_mod_table_t *set_map) +{ + shash_t *input_table; + + shash_mod_table_iterator_t it; + shash_mod_table_iterator_init(set_map, &it); + for (;;) { + shash_coord_t key; + uint64_t *val = shash_mod_table_iterator_next(set_map, &it, &key); + if (!val) { + break; + } + // Now we know we need the table, create it if it's missing. + input_table = iqueue_writer_table(input, key.table, 1); + shash_entry_t *entry = shash_get(input_table, key.key); + if (entry == NULL) { + shash_insert(input_table, key.key, *val); + } else { + shash_update(input_table, entry, entry->value, *val); + } + } +} + +int +main(int argc, char **argv) +{ + size_t found_tokens; + uint64_t *token_list; + + shash_mod_table_t *shash_set_map = shash_mod_table_create(32); + uint64_t creation_time = 0; + + int option = -1; + while ((option = getopt(argc, argv, "hc:s:")) != -1) { + switch (option) { + case 'h': + print_help(argv); + return EXIT_FAILURE; + case 'c': + token_list = parse_tuple(1, 1, optarg, &found_tokens, 0); + creation_time = token_list[0]; + break; + case 's': + { + token_list = parse_tuple(3, 3, optarg, &found_tokens, 0); + shash_coord_t coord = (shash_coord_t){token_list[0], token_list[1]}; + uint64_t *table_value = shash_mod_table_insert(shash_set_map, coord); + *table_value = token_list[2]; + break; + } + default: + fprintf(stderr, "Uknown option `%c` (%d)\n", option, option); + print_help(argv); + return EXIT_FAILURE; + } + } + + if (optind == 1) { + fprintf(stderr, "Expecting at least 1 editing option, but 
found none\n"); + print_help(argv); + return EXIT_FAILURE; + } + + if (argc - optind != 1) { + fprintf(stderr, "Expecting exactly 1 non-option arguments, found `%d`\n", argc - optind); + print_help(argv); + return EXIT_FAILURE; + } + + const char *input_path = argv[optind]; + + iqueue_t *input = iqueue_open(input_path, true); + if (!input) { + err(1, "Could not open input iqx `%s`", input_path); + } + + if (creation_time != 0) { + if (iqueue_update_creation_time(input, creation_time) == -1) { + err(1, "Failed to modify creation_time for `%s`", input_path); + } + } + + mod_shashes(input, shash_set_map); + return EXIT_SUCCESS; +} diff --git a/src/iqsync-main.c b/src/iqsync-main.c index 4a5227a..3700749 100644 --- a/src/iqsync-main.c +++ b/src/iqsync-main.c @@ -1,16 +1,20 @@ -/* $TwoSigma: iqsync-main.c,v 1.6 2012/02/02 20:53:59 thudson Exp $ */ - /* - * Copyright (c) 2010 Two Sigma Investments, LLC - * All Rights Reserved + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF - * Two Sigma Investments, LLC. + * http://www.apache.org/licenses/LICENSE-2.0 * - * The copyright notice above does not evidence any - * actual or intended publication of such source code. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Long command line options accepted by iqsync, terminated by an all-zero
 * entry as getopt_long() requires.
 *
 * Every entry has a NULL flag pointer, so getopt_long() returns the last
 * field. Options that share a short form return that character; the
 * long-only options return the small integer codes 1-7.
 *
 * The table is never written to, hence const (getopt_long() takes a
 * pointer to const).
 */
static const struct option long_options[] = {
    { "help",               no_argument,       0, '?' },
    { "iqueue",             required_argument, 0, 'f' },
    { "tail",               no_argument,       0, 't' },
    { "server",             required_argument, 0, 'R' },
    { "sleep",              required_argument, 0, 's' },
    { "rate-limit",         required_argument, 0, 'M' },
    { "report-interval",    required_argument, 0, 'r' },
    { "type",               required_argument, 0, 'T' },
    { "push",               no_argument,       0, 'p' },
    { "pull",               no_argument,       0, 'P' },
    { "nop",                no_argument,       0, 'Z' },
    { "validate",           no_argument,       0, 'V' },
    { "verbose",            no_argument,       0, 'v' },
    { "quiet",              no_argument,       0, 'q' },
    { "clone",              no_argument,       0, 'C' },
    { "remote-cpu",         required_argument, 0, 'K' },
    { "cpu",                required_argument, 0, 'c' },
    { "prefetch",           no_argument,       0, 'e' },
    { "syncbehind",         no_argument,       0, 'b' },
    { "filter",             required_argument, 0, 'F' },
    { "clone-push",         no_argument,       0, 'O' },
    // Long-only options: codes 1-7.
    { "launch-server",      no_argument,       0, 1 },
    { "connection-timeout", required_argument, 0, 2 },
    { "launched-by-client", no_argument,       0, 3 },
    { "send-buffer",        no_argument,       0, 4 },
    { "send-buffer-size",   required_argument, 0, 5 },
    { "recv-buffer",        no_argument,       0, 6 },
    { "recv-buffer-size",   required_argument, 0, 7 },
    { 0, 0, 0, 0},
};
--launch-server user@host:/path/to/remote.iqx\n" "\n" "To invoke from netcat, create a shell script to invoke iqsync and\n" "invoke it with \"--server unused --type ssh\". Netcat will wait for\n" @@ -134,78 +175,161 @@ usage( exit(EXIT_FAILURE); } +static int +iqsync_setup_filter( + iqsync_t * const iqsync, + const char *arg) +{ + char *filter_arg = strdup(arg); + char *colon = strchr(filter_arg, ':'); + if (!colon) + TSABORTX("badly formatted filter argument '%s', " + "MUST be in :\n", optarg); + + *colon = 0; + const char *filename = filter_arg; + const char *symbol = colon + 1; + + void *lib = dlopen(filename, RTLD_LAZY); + if (!lib) + TSABORT("could not open library '%s': %s", filename, dlerror()); + + iqsync_filter_setup_fn_t setup = dlsym(lib, symbol); + if (!setup) + TSABORT("could not find filter setup function '%s'" + "in library '%s'", + symbol, filename); + + iqsync->filter_count++; + iqsync->filters = realloc(iqsync->filters, sizeof(*iqsync->filters) * iqsync->filter_count); + iqsync->filters[iqsync->filter_count - 1] = (iqsync_filter_t) { + .filter_setup = setup, + }; + + free(filter_arg); + + return 0; +} static int -iqsync_setup_ssh( +iqsync_start_remote( iqsync_t * const iqsync, - const char * remote_name + const char * remote_name, + bool is_tcp, + int fds[], + pid_t *pchild ) { - // If we are in srever mode, everything is setup - if (iqsync->do_server) - return 0; - // Get remote host:file from argv iqsync->remote.name = remote_name; if (!iqsync->remote.name) - TSABORTX("Remote iqueue must be specified"); + TSABORTX("Remote iqueue must be specified"); char * remote_host = strdup(iqsync->remote.name); + if (!remote_host) abort(); char * remote_file = index(remote_host, ':'); if (!remote_file) - TSABORTX("Unable to parse remote iqueue name '%s'", remote_host); + TSABORTX("Unable to parse remote iqueue name '%s'", remote_host); *(remote_file++) = '\0'; // Check for an env variable for the iqsync command const char * remote_cmd = getenv("IQSYNC_CMD"); 
if (!remote_cmd) - remote_cmd = "iqsync"; + remote_cmd = "iqsync"; const char * ssh_cmd = getenv("IQSYNC_RSH"); if (!ssh_cmd) - ssh_cmd = "/usr/bin/ssh"; - + ssh_cmd = "/usr/bin/ssh"; // Determine my name to pass as the remote hostname char my_name[1024]; - if (gethostname(my_name, sizeof(my_name)) < 0) - TSABORT("Unable to get my hostname"); - int name_len = strlen(my_name); - my_name[name_len++] = ':'; - strncpy(my_name + name_len, iqsync->local.name, sizeof(my_name) - name_len); + if (is_tcp) { + strncpy(my_name, "0", sizeof(my_name)); + } else { + if (gethostname(my_name, sizeof(my_name)) < 0) + TSABORT("Unable to get my hostname"); + int name_len = strlen(my_name); + my_name[name_len++] = ':'; + strncpy(my_name + name_len, iqsync->local.name, sizeof(my_name) - name_len); + } char usleep_str[16]; snprintf(usleep_str, sizeof(usleep_str), "%d", iqsync->usleep_time); char rate_limit_str[16]; - snprintf(rate_limit_str, sizeof(rate_limit_str), "%"PRIu64, iqsync->rate_limit); + snprintf(rate_limit_str, sizeof(rate_limit_str), + "%"PRIu64, + iqsync->rate_limit); + + char connection_timeout_str[16]; + snprintf(connection_timeout_str, sizeof(connection_timeout_str), + "%"PRIu64, + iqsync->connection_timeout_sec); + + char sendbuffer_size_str[16]; + snprintf(sendbuffer_size_str, sizeof(sendbuffer_size_str), + "%"PRIu32, + (iqsync->sendbuffer_len ? iqsync->sendbuffer_len : DEFAULT_SENDBUFFER_SIZE)); + char recvbuffer_size_str[16]; + snprintf(recvbuffer_size_str, sizeof(recvbuffer_size_str), + "%"PRIu32, + (iqsync->recvbuffer_len ? iqsync->recvbuffer_len : DEFAULT_RECVBUFFER_SIZE)); // Redirect stdin/stdout, but let it write to our stderr - int fds[3]; pid_t child = tsio_open3( - fds, - TSIO_STDIN_MASK | TSIO_STDOUT_MASK, - ssh_cmd, - (const char *[]) { - ssh_cmd, - remote_host, - remote_cmd, - "--server", my_name, - "--type", "ssh", - "-f", - remote_file, - "--sleep", usleep_str, - "--rate-limit", rate_limit_str, - iqsync->verbose ? 
"--verbose" : "--nop", - iqsync->do_push ? "--pull" : "--nop", // note reversed sense - iqsync->do_pull ? "--push" : "--nop", // note reversed sense - iqsync->do_tail ? "--tail" : "--nop", - iqsync->remote_cpu ? "--cpu" : "--nop", - iqsync->remote_cpu ? iqsync->remote_cpu : "--nop", - 0 - } + fds, + TSIO_STDIN_MASK | TSIO_STDOUT_MASK, + ssh_cmd, + (const char *[]) { + ssh_cmd, + remote_host, + remote_cmd, + "--server", my_name, + "--type", is_tcp ? "tcp" : "ssh", + "-f", + remote_file, + "--sleep", usleep_str, + "--rate-limit", rate_limit_str, + "--connection-timeout", connection_timeout_str, + "--send-buffer-size", sendbuffer_size_str, + "--recv-buffer-size", recvbuffer_size_str, + iqsync->verbose ? "--verbose" : "--nop", + iqsync->quiet ? "--quiet" : "--nop", + iqsync->do_push ? ( iqsync->do_clone_push ? "--clone" : "--pull" ) : "--nop", // note reversed sense + iqsync->do_pull ? "--push" : "--nop", // note reversed sense + iqsync->do_tail ? "--tail" : "--nop", + iqsync->do_hdr_validate ? "--validate" : "--nop", + iqsync->remote_cpu ? "--cpu" : "--nop", + iqsync->remote_cpu ? iqsync->remote_cpu : "--nop", + is_tcp ? "--launched-by-client" : "--nop", + iqsync->use_sendbuffer ? "--send-buffer" : "--nop", + iqsync->use_recvbuffer ? 
"--recv-buffer" : "--nop", + 0 + } ); if (child < 0) - TSABORTX("Unable to fork %s", ssh_cmd); + TSABORTX("Unable to fork %s", ssh_cmd); + + if (pchild) + *pchild = child; + + free(remote_host); + return 0; +} + +static int +iqsync_setup_ssh( + iqsync_t * const iqsync, + const char * remote_name +) +{ + // If we are in srever mode, everything is setup + if (iqsync->do_server) + return 0; + + int fds[3]; + if (iqsync_start_remote(iqsync, remote_name, false, fds, NULL) < 0) { + return -1; + } iqsync->read_fd = fds[1]; iqsync->write_fd = fds[0]; @@ -213,11 +337,94 @@ iqsync_setup_ssh( return 0; } +static int +iqsync_setup_tcp_both_side( + iqsync_t * const iqsync, + const char * const remote_name) +{ + pid_t child = -1; + int fds[3]; + if (iqsync_start_remote(iqsync, remote_name, true, fds, &child) < 0) { + return -1; + } + + uint16_t nport = 0; + char * ptr = (char *)&nport; + size_t offset = 0; + int64_t timeout_nanos = 0; + if (iqsync->connection_timeout_sec > 0) + timeout_nanos = tsclock_getnanos(0) + iqsync->connection_timeout_sec * NANOS_IN_SECOND; + + int flags = fcntl(fds[1], F_GETFL, 0); + if (fcntl(fds[1], F_SETFL, flags | O_NONBLOCK)) + TSABORTX("Unable to set nonblocking flag"); + + while (offset < sizeof(nport)) { + + if (timeout_nanos > 0 && tsclock_getnanos(0) > timeout_nanos) { + TSLOGX(TSERROR, "Timed out while waiting for port number from the server"); + return -1; + } + + ssize_t rlen = read(fds[1], ptr + offset, sizeof(nport) - offset); + if (rlen < 0) { + if (errno == EAGAIN) + continue; + + TSLOGL(TSERROR, "failed to receive port number from the server"); + waitpid(child, NULL, 0); + return -1; + } + + offset += rlen; + } + + uint16_t server_port = be16toh(nport); + char server_port_str[8]; + snprintf(server_port_str, sizeof(server_port_str), "%"PRIu16, server_port); + + char * remote_host = strdup(remote_name); + if (!remote_host) abort(); + char * remote_file = index(remote_host, ':'); + if (!remote_file) + TSABORTX("Unable to parse remote 
iqueue name '%s'", remote_host); + *(remote_file++) = '\0'; + + const char *host_name = remote_host; + if (index(remote_host, '@') != NULL) + host_name = index(remote_host, '@') + 1; + + if (iqsync->connection_timeout_sec > 0) + timeout_nanos = tsclock_getnanos(0) + iqsync->connection_timeout_sec * NANOS_IN_SECOND; + + while (1) + { + int fd = tsnet_tcp_client_socket(host_name, server_port_str, 0); + if (fd < 0) { + if (timeout_nanos > 0 && tsclock_getnanos(0) > timeout_nanos) + { + TSLOGL(TSERROR, + "Timed out while trying to connect to %s:%s", + host_name, server_port_str); + waitpid(child, NULL, 0); + return -1; + } + continue; + } + + iqsync->read_fd = fd; + iqsync->write_fd = fd; + + free(remote_host); + return 0; + } +} static int iqsync_setup_tcp( iqsync_t * const iqsync, - const char * const remote_name + const char * const remote_name, + const bool launched_by_client ) { static char default_port[] = "20809"; @@ -225,33 +432,34 @@ iqsync_setup_tcp( if (!iqsync->do_server) { - // client connects make a TCP socket and are done. - char * name = strdup(remote_name); - char * port = strchr(name, ':'); - if (port) - *port++ = '\0'; - else - port = default_port; - - TSLOGX(TSINFO, "%s: Connecting to %s:%s", - iqsync->local.name, - name, - port - ); - - int fd = tsnet_tcp_client_socket(name, port, 0); - if (fd < 0) - { - TSLOG(TSERROR, "Unable to connect to %s:%s", name, port); - return -1; - } - - iqsync->read_fd = fd; - iqsync->write_fd = fd; - iqsync->remote.name = name; - return 0; + // client connects make a TCP socket and are done. 
+ char * name = strdup(remote_name); + char * port = strchr(name, ':'); + if (port) + *port++ = '\0'; + else + port = default_port; + + TSLOGXL(TSINFO, "%s: Connecting to %s:%s", + iqsync->local.name, + name, + port + ); + + int fd = tsnet_tcp_client_socket(name, port, 0); + if (fd < 0) + { + TSLOGL(TSERROR, "Unable to connect to %s:%s", name, port); + return -1; + } + + iqsync->read_fd = fd; + iqsync->write_fd = fd; + iqsync->remote.name = name; + return 0; } + TSLOGX(TSINFO, "opening iqueue %s", iqsync->local.name); // Make sure the parameters are correct; can not do a clone into // a non-existant iqueue // only get write access if we are pulling i @@ -259,8 +467,8 @@ iqsync_setup_tcp( iqsync->iq = iqueue_open(iqsync->local.name, writable); if (!iqsync->iq) { - TSLOGX(TSERROR, "%s: Unbale to open", iqsync->local.name); - return -1; + TSLOGXL(TSERROR, "%s: Unable to open", iqsync->local.name); + return -1; } // Use the remote server name for [[IP:]PORT] to bind to @@ -270,175 +478,272 @@ iqsync_setup_tcp( char * orig_name = strdup(iqsync->remote.name); if (iqsync->remote.name[0] != '\0') { - char * colon = strchr(orig_name, ':'); - if (!colon) - port_name = orig_name; - else { - *port_name++ = '\0'; - server_name = orig_name; - } + char * colon = strchr(orig_name, ':'); + if (!colon) + port_name = orig_name; + else { + *port_name++ = '\0'; + server_name = orig_name; + } } int server_fd = tsnet_tcp_server_socket(server_name, port_name, 0); if (server_fd < 0) - TSABORT("tcp bind to %s:%s", server_name, port_name); + TSABORT("tcp bind to %s:%s", server_name, port_name); + + struct sockaddr_in server_addr; + socklen_t server_len = sizeof(server_addr); + if (getsockname(server_fd, &server_addr, &server_len) < 0) + TSABORT("unable to getsockname"); + + uint16_t server_port = be16toh(server_addr.sin_port); - TSLOGX(TSINFO, - "%s: Waiting for inbound connections on TCP port %s:%s", - iqsync->local.name, - server_name, - port_name + TSLOGXL(TSINFO, + "%s: Waiting for 
inbound connections on TCP port %s:%"PRIu16, + iqsync->local.name, + server_name, + server_port ); + if (launched_by_client) { + // send port number to client + uint16_t nport = htobe16(server_port); + ssize_t wlen = write(STDOUT_FILENO, &nport, sizeof(nport)); + if (wlen < 0 || wlen != (ssize_t)sizeof(nport)) + TSABORT("failed to write to stdout"); + + fsync(STDOUT_FILENO); + } + while (1) { - struct sockaddr_in remote_addr; - socklen_t remote_len = sizeof(remote_addr); - int fd = accept(server_fd, &remote_addr, &remote_len); - if (fd < 0) - TSABORT("accept %s:%s", server_name, port_name); - - // Duplicate the iqsync_t for the new connection - iqsync_t * const new_iqsync = calloc(1, sizeof(*new_iqsync)); - if (!new_iqsync) - TSABORT("unable to allocate iqsync object"); - *new_iqsync = *iqsync; - - char client_name[256]; - snprintf(client_name, sizeof(client_name), "%s:%d", - inet_ntoa(remote_addr.sin_addr), - ntohs(remote_addr.sin_port) - ); - new_iqsync->remote.name = strdup(client_name); - - TSLOGX(TSINFO, "%s: Connected to %s", - new_iqsync->local.name, - new_iqsync->remote.name - ); - - new_iqsync->read_fd = fd; - new_iqsync->write_fd = fd; - - if (iqsync_start(new_iqsync) < 0) - return -1; - - // Detatch from the threads so that they will exit cleanly - // This will leak the new_iqsync object, but that is ok for now. 
- pthread_detach(new_iqsync->push_thread); - pthread_detach(new_iqsync->pull_thread); - pthread_detach(new_iqsync->stat_thread); + if (launched_by_client && iqsync->connection_timeout_sec) { + struct timeval timeout = { + .tv_sec = iqsync->connection_timeout_sec, + .tv_usec = 0 + }; + + fd_set readfds; + FD_ZERO(&readfds); + FD_SET(server_fd, &readfds); + int rc = select(server_fd + 1, &readfds, NULL, NULL, &timeout); + if (rc < 0) { + TSABORT("%s: Select failed", iqsync->local.name); + } else if (rc == 0) { + TSABORTX("%s: Timed out while waiting for connection", iqsync->local.name); + } + } + + struct sockaddr_in remote_addr; + socklen_t remote_len = sizeof(remote_addr); + int fd = accept(server_fd, &remote_addr, &remote_len); + if (fd < 0) + TSABORT("accept %s:%"PRIu16, server_name, server_port); + + // Duplicate the iqsync_t for the new connection + iqsync_t * const new_iqsync = calloc(1, sizeof(*new_iqsync)); + if (!new_iqsync) + TSABORT("unable to allocate iqsync object"); + *new_iqsync = *iqsync; + + char client_name[256]; + snprintf(client_name, sizeof(client_name), "%s:%d", + inet_ntoa(remote_addr.sin_addr), + ntohs(remote_addr.sin_port) + ); + new_iqsync->remote.name = strdup(client_name); + + TSLOGXL(TSINFO, "%s: Connected to %s", + new_iqsync->local.name, + new_iqsync->remote.name + ); + + new_iqsync->read_fd = fd; + new_iqsync->write_fd = fd; + + if (iqsync_start(new_iqsync) < 0) + return -1; + + if (launched_by_client) { + if (iqsync_wait(new_iqsync) < 0) + TSABORTX("iqsync_wait failed"); + + break; + } else { + // Detatch from the threads so that they will exit cleanly + // This will leak the new_iqsync object, but that is ok for now. 
+ pthread_detach(new_iqsync->push_thread); + pthread_detach(new_iqsync->pull_thread); + pthread_detach(new_iqsync->stat_thread); + } } - // Unreachable + TSLOGXL(TSINFO, "exiting success"); free(orig_name); close(server_fd); exit(EXIT_SUCCESS); } - int main( int argc, char **argv ) { - //segfault_handler_install(); + signal(SIGPIPE, SIG_IGN); + iqsync_t * const iqsync = calloc(1, sizeof(*iqsync)); + if (!iqsync) + TSABORT("alloc failed"); *iqsync = (iqsync_t) { - .report_interval = 600, - .usleep_time = 100, - .heartbeats_lock = tslock_alloc(), - .read_fd = STDIN_FILENO, - .write_fd = STDOUT_FILENO, + .report_interval = 600, + .usleep_time = 100, + .heartbeats_lock = tslock_alloc(), + .read_fd = STDIN_FILENO, + .write_fd = STDOUT_FILENO, + + .connection_timeout_sec = 120, }; + if (!iqsync->heartbeats_lock) + TSABORT("lock alloc failed"); int option_index = 0; - const char * usleep_time_str = "0"; const char * transport_type = "ssh"; bool prefetch = false; bool syncbehind = false; + bool launched_by_client = false; + bool launch_server = false; while (1) { - int c = getopt_long( - argc, - argv, - "h?f:ts:pvVZR:Cr:T:m:c:K:eb", - long_options, - &option_index - ); - - if (c == -1) - break; - - switch (c) - { - case 0: break; - default: usage(stderr, ""); break; - case 'h': case '?': usage(stdout, ""); break; - - case 'Z': break; // nop - case 'C': - iqsync->do_clone = 1; - iqsync->do_hdr_validate = 1; - iqsync->do_pull = 1; - break; - case 'f': iqsync->local.name = optarg; break; - case 't': iqsync->do_tail = 1; break; - case 'c': iqsync->local_cpu = optarg; break; - case 'K': iqsync->remote_cpu = optarg; break; - case 'M': iqsync->rate_limit = strtoul(optarg, NULL, 0); break; - case 'T': transport_type = optarg; break; - case 'p': iqsync->do_push = 1; break; - case 'P': iqsync->do_pull = 1; break; - case 'V': iqsync->do_hdr_validate = 1; break; - case 'v': iqsync->verbose++; tslevel = TSDEBUG; break; - case 'r': iqsync->report_interval = strtoul(optarg, NULL, 0); 
break; - case 'R': - iqsync->do_server = 1; - iqsync->remote.name = optarg; - break; - case 's': - usleep_time_str = optarg; - iqsync->usleep_time = strtoul(optarg, 0, 0); - break; - case 'e': - prefetch = true; - break; - case 'b': - syncbehind = true; + int c = getopt_long( + argc, + argv, + "h?f:ts:pvVZR:Cr:T:m:c:K:ebOq", + long_options, + &option_index + ); + + if (c == -1) + break; + + switch (c) + { + case 0: break; + default: usage(stderr, ""); break; + case 'h': case '?': usage(stdout, ""); break; + + case 'Z': break; // nop + case 'C': + iqsync->do_clone = 1; + iqsync->do_hdr_validate = 1; + iqsync->do_pull = 1; + break; + case 'O': + iqsync->do_clone_push = 1; + iqsync->do_hdr_validate = 1; + iqsync->do_push = 1; + break; + case 'f': iqsync->local.name = optarg; break; + case 't': iqsync->do_tail = 1; break; + case 'c': iqsync->local_cpu = optarg; break; + case 'K': iqsync->remote_cpu = optarg; break; + case 'M': iqsync->rate_limit = strtoul(optarg, NULL, 0); break; + case 'T': transport_type = optarg; break; + case 'p': iqsync->do_push = 1; break; + case 'P': iqsync->do_pull = 1; break; + case 'V': iqsync->do_hdr_validate = 1; break; + case 'v': iqsync->verbose++; tslevel = TSDEBUG; break; + case 'q': iqsync->quiet++; tslevel = TSWARN; break; + case 'r': iqsync->report_interval = strtoul(optarg, NULL, 0); break; + case 'R': + iqsync->do_server = 1; + iqsync->remote.name = optarg; + break; + case 's': + iqsync->usleep_time = strtoul(optarg, NULL, 0); + break; + case 'e': + prefetch = true; + break; + case 'b': + syncbehind = true; + break; + case 'F': { + if (iqsync_setup_filter(iqsync, optarg) < 0) + usage(stderr, "failed to parse filter argument"); break; } + case 1: + launch_server = true; + break; + case 2: + iqsync->connection_timeout_sec = strtoul(optarg, NULL, 0); + break; + case 3: + launched_by_client = true; + break; + case 4: + iqsync->use_sendbuffer = 1; + break; + case 5: + iqsync->sendbuffer_len = strtoul(optarg, NULL, 0); + if 
(iqsync->sendbuffer_len == 0) + usage(stderr, "Invalid sendbuffer length"); + else if (ceilintpow2(iqsync->sendbuffer_len) != iqsync->sendbuffer_len) + usage(stderr, "sendbuffer length must be power of 2"); + break; + case 6: + iqsync->use_recvbuffer = 1; + break; + case 7: + iqsync->recvbuffer_len = strtoul(optarg, NULL, 0); + if (iqsync->recvbuffer_len == 0) + usage(stderr, "Invalid recvbuffer length"); + else if (ceilintpow2(iqsync->recvbuffer_len) != iqsync->recvbuffer_len) + usage(stderr, "recvbuffer length must be power of 2"); + break; + } } + if (iqsync->verbose && iqsync->quiet) + usage(stderr, "Quiet and verbose modes cannot be set concurrently!\n"); + if (!iqsync->local.name) - usage(stderr, "iqueue file must be specified!\n"); + usage(stderr, "iqueue file must be specified!\n"); if (!iqsync->do_push && !iqsync->do_pull) - usage(stderr, "At least one of --push / --pull must be specified!\n"); + usage(stderr, "At least one of --push / --pull must be specified!\n"); if (strcmp(transport_type, "ssh") == 0) { - if (iqsync_setup_ssh(iqsync, argv[optind]) < 0) - return -1; + if (iqsync_setup_ssh(iqsync, argv[optind]) < 0) + return -1; } else if (strcmp(transport_type, "tcp") == 0) { - if (iqsync_setup_tcp(iqsync, argv[optind]) < 0) - return -1; + if (!iqsync->do_server && launch_server) { + if (iqsync_setup_tcp_both_side(iqsync, argv[optind]) < 0) + return -1; + } else { + if (iqsync_setup_tcp( + iqsync, argv[optind], launched_by_client) < 0) + return -1; + } } else - usage(stderr, "Unknown --type option!\n"); + usage(stderr, "Unknown --type option!\n"); iqsync->do_prefetch = prefetch; iqsync->do_syncbehind = syncbehind; // All configured. 
Start the threads if (iqsync_start(iqsync) < 0) - return -1; + TSABORTX("iqsync_start failed"); if (iqsync_wait(iqsync) < 0) - return -1; + TSABORTX("iqsync_wait failed"); + TSLOGXL(TSINFO, "exiting success"); return 0; } diff --git a/src/iqsync.c b/src/iqsync.c index d399b77..b303620 100644 --- a/src/iqsync.c +++ b/src/iqsync.c @@ -1,16 +1,22 @@ -/* $TwoSigma: iqsync.c,v 1.24 2012/02/07 13:37:40 thudson Exp $ */ - /* - * Copyright (c) 2010 Two Sigma Investments, LLC - * All Rights Reserved + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF - * Two Sigma Investments, LLC. + * http://www.apache.org/licenses/LICENSE-2.0 * - * The copyright notice above does not evidence any - * actual or intended publication of such source code. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "twosigma.h" +#include +#include +#include #include #include #include @@ -20,17 +26,16 @@ #include #include #include -#include "bswap.h" -#include "tsutil.h" -#include "tslock.h" -#include "tsio.h" +#include +#include "container_of.h" +#include "tsassert.h" +#include "tslog.h" +#include "io_utils.h" #include "tsclock.h" -#include "tssched.h" #include "iqueue.h" #include "iqsync.h" -#include "segfault.h" - -__RCSID("$TwoSigma: iqsync.c,v 1.24 2012/02/07 13:37:40 thudson Exp $"); +#include "math_utils.h" +#include "tslock.h" /** \file * Core iqsync algorithm and threads. 
@@ -52,21 +57,21 @@ iqsync_stats( const uint64_t report_delta = now - iqsync->report_time; if (iqsync->do_pull) - TSLOGX(TSINFO, "%s: RX index %"PRId64": %"PRIu64" messages %.3f kpps, %.2f MB/s", - iqsync->remote.name, - iqsync->remote.index, - rx_count, - (rx_count - iqsync->report_rx_count) * 1.0e6 / report_delta, - (rx_len - iqsync->report_rx_len) * 1.0e3 / report_delta + TSLOGXL(TSINFO, "%s: RX index %"PRId64": %"PRIu64" messages %.3f kpps, %.2f MB/s", + iqsync->remote.name, + iqsync->remote.index, + rx_count, + (rx_count - iqsync->report_rx_count) * 1.0e6 / report_delta, + (rx_len - iqsync->report_rx_len) * 1.0e3 / report_delta ); if (iqsync->do_push) - TSLOGX(TSINFO, "%s: TX index %"PRId64": %"PRIu64" messages %.3f kpps, avg size %"PRIu64" bytes", - iqsync->remote.name, - iqsync->local.index, - tx_count, - (tx_count - iqsync->report_tx_count) * 1.0e6 / report_delta, - iqsync->avg_msg_len + TSLOGXL(TSINFO, "%s: TX index %"PRId64": %"PRIu64" messages %.3f kpps, avg size %"PRIu64" bytes", + iqsync->remote.name, + iqsync->local.index, + tx_count, + (tx_count - iqsync->report_tx_count) * 1.0e6 / report_delta, + iqsync->avg_msg_len ); iqsync->report_rx_count = rx_count; @@ -83,18 +88,39 @@ iqsync_stat_thread( { iqsync_t * const iqsync = iqsync_ptr; if (iqsync->report_interval == 0) - return NULL; + return NULL; - while (!iqsync->do_shutdown) - { - sleep(iqsync->report_interval); - iqsync_stats(iqsync); + pthread_mutex_lock(&iqsync->stats_shutdown_mutex); + + while (!iqsync->do_shutdown) { + struct timespec waittime; + clock_gettime(CLOCK_REALTIME, &waittime); + waittime.tv_sec += iqsync->report_interval; + + pthread_cond_timedwait(&iqsync->stats_shutdown_cond, + &iqsync->stats_shutdown_mutex, &waittime); + if (iqsync->do_shutdown) + break; + + iqsync_stats(iqsync); } + pthread_mutex_unlock(&iqsync->stats_shutdown_mutex); return NULL; } +static inline uint64_t +iqsync_block_index(uint64_t idx, uint64_t block_shift) +{ + return (idx >> block_shift); +} + +static 
inline uint64_t +iqsync_block_offset(uint64_t idx, uint64_t offset_mask) +{ + return (idx & offset_mask); +} /** Return a pointer to the containing iqsync_msg_t if the * message in the iqueue is of type iqsync_msg. Returns NULL @@ -108,12 +134,12 @@ iqsync_data_msg( { const void * const data = iqueue_get_data(iq, offset, 1); if (!data) - return NULL; + return NULL; const struct iqsync_data * const msg = container_of( - (void*)(uintptr_t) data, - const struct iqsync_data, - data + (void*)(uintptr_t) data, + const struct iqsync_data, + data ); // Make sure that the offset does not cross the front of @@ -122,7 +148,7 @@ iqsync_data_msg( // to be sure that the msg header is intact. if ((offset & IQUEUE_BLOCK_MASK) < offsetof(struct iqsync_data, data) || msg->magic != htobe64(IQSYNC_DATA_MAGIC)) - return NULL; + return NULL; // It appears to be a valid iqsync message. return msg; @@ -142,18 +168,18 @@ iqsync_sources_setup( if (!sh) { - TSLOGX(TSERROR, "%s: Unable to create iqsync sources table", - iqueue_name(iq) - ); - return -1; + TSLOGXL(TSERROR, "%s: Unable to create iqsync sources table", + iqueue_name(iq) + ); + return -1; } iqsync->sources = shash_copy(sh); iqsync->scan_index = shash_insert_or_get(iqsync->sources, -1, 0); - TSLOGX(TSINFO, "%s: Sources table skipping to index %"PRIu64, - iqueue_name(iq), - iqsync->scan_index->value + TSLOGXL(TSINFO, "%s: Sources table skipping to index %"PRIu64, + iqueue_name(iq), + iqsync->scan_index->value ); return 0; @@ -171,20 +197,20 @@ iqsync_hash_update( if (!source) { - // Writer does not yet exist; create a new one. - // If this succeeds, the source id not yet exist and the newly - // created one will have the value that we provided. 
- source = shash_insert(sh, src_id, src_index); - - // If it did not succeed, then we raced with another thread to - // create this entry in the hash and must follow the update - // protocol by retrieving the existing one - if (!source) - source = shash_get(sh, src_id); - - // If there is still no source, the iqueue is corrupted - if (!source) - TSABORTX("corrupt iqueue? bad source behaviour"); + // Writer does not yet exist; create a new one. + // If this succeeds, the source id not yet exist and the newly + // created one will have the value that we provided. + source = shash_insert(sh, src_id, src_index); + + // If it did not succeed, then we raced with another thread to + // create this entry in the hash and must follow the update + // protocol by retrieving the existing one + if (!source) + source = shash_get(sh, src_id); + + // If there is still no source, the iqueue is corrupted + if (!source) + TSABORTX("corrupt iqueue? bad source behaviour for source id %"PRIu64, src_id); } return iqueue_writer_update(sh, source, src_index); @@ -201,12 +227,12 @@ iqsync_sources_get( ) { if (src_id == 0) - return 0; + return 0; shash_entry_t * const source = shash_insert_or_get( - iqsync->sources, - src_id, - 0 + iqsync->sources, + src_id, + 0 ); return source->value; @@ -255,63 +281,68 @@ iqsync_sources_scan_all( while (1) { - size_t len; - const uint64_t offset = iqueue_offset(iq, scan_index, &len); - - // At the end of the queue? We're done scanning. - if (offset == (uint64_t) -1) - break; - - // There is a new message; update our iterator - scan_index++; - - // If the scanned slot does not contain an iqsync message, - // we can ignore it and move on to the next slot - const struct iqsync_data * const old_msg = iqsync_data_msg(iq, offset); - if (!old_msg) - continue; - - // Update the hash table for this original source and the iqueue - // through which it might have been routed. - // If this races with other iqsyncs, the highest value will win. 
- // The update does not need to do an atomic in the case were another - // iqsync has already updated the table. - const uint64_t orig_src = be64toh(old_msg->orig_src); - const uint64_t orig_index = be64toh(old_msg->orig_index); - const uint64_t route_src = be64toh(old_msg->src); - const uint64_t route_index = be64toh(old_msg->iq_index); - - // If the source and routed sources are the same, the indices - // had better agree. Otherwise something has gone horribly wrong - // in the protocol. - if (orig_src == route_src - && orig_index != route_index) - TSABORTX("%s: iqsync protocol error!" - " original source %"PRIu64".%"PRIu64 - " != routed %"PRIu64".%"PRIu64 - "!", - iqsync->remote.name, - orig_src, - orig_index, - route_src, - route_index - ); - - // Update the original and routing iqueue indices in the shared table - // The value recorded is the next expected index, not the most - // recently seen index. This allows the value to start at 0, - // rather than -1. - iqsync_hash_update(iqsync->sources, orig_src, orig_index + 1); - if (orig_src != route_src) - iqsync_hash_update(iqsync->sources, route_src, route_index + 1); - - // If the caller has specified a source that matches either id, - // update the tell them the index for that source - if (src_id == orig_src) - src_index = orig_index + 1; - else - if (src_id == route_src) - src_index = route_index + 1; + size_t len; + const uint64_t offset = iqueue_offset(iq, scan_index, &len); + + // At the end of the queue? We're done scanning. + if (offset == (uint64_t) -1) + break; + + // There is a new message; update our iterator + scan_index++; + + // If the scanned slot does not contain an iqsync message, + // we can ignore it and move on to the next slot + const struct iqsync_data * const old_msg = iqsync_data_msg(iq, offset); + if (!old_msg) + continue; + + // Update the hash table for this original source and the iqueue + // through which it might have been routed. 
+ // If this races with other iqsyncs, the highest value will win. + // The update does not need to do an atomic in the case were another + // iqsync has already updated the table. + const uint64_t orig_src = be64toh(old_msg->orig_src); + const uint64_t orig_index = be64toh(old_msg->orig_index); + const uint64_t route_src = be64toh(old_msg->src); + const uint64_t route_index = be64toh(old_msg->iq_index); + + if (orig_src == 0 || route_src == 0) + TSABORTX("Bad source ids! orig_src: %"PRIu64" orig_index: %"PRIu64 + " route_src: %"PRIu64" route_index: %"PRIu64" scan_index: %"PRIu64, + orig_src, orig_index, route_src, route_index, scan_index - 1); + + // If the source and routed sources are the same, the indices + // had better agree. Otherwise something has gone horribly wrong + // in the protocol. + if (orig_src == route_src + && orig_index != route_index) + TSABORTX("%s: iqsync protocol error!" + " original source %"PRIu64".%"PRIu64 + " != routed %"PRIu64".%"PRIu64 + "!", + iqsync->remote.name, + orig_src, + orig_index, + route_src, + route_index + ); + + // Update the original and routing iqueue indices in the shared table + // The value recorded is the next expected index, not the most + // recently seen index. This allows the value to start at 0, + // rather than -1. + iqsync_hash_update(iqsync->sources, orig_src, orig_index + 1); + if (orig_src != route_src) + iqsync_hash_update(iqsync->sources, route_src, route_index + 1); + + // If the caller has specified a source that matches either id, + // update the tell them the index for that source + if (src_id == orig_src) + src_index = orig_index + 1; + else + if (src_id == route_src) + src_index = route_index + 1; } // We've hit the end of the iqueue. Update the tail pointer in @@ -321,28 +352,28 @@ iqsync_sources_scan_all( // since the scan_index is only maintained with loose consistency. 
const uint64_t cur_scan_index = iqsync->scan_index->value; if (cur_scan_index < scan_index) - shash_update( - iqsync->sources, - iqsync->scan_index, - cur_scan_index, - scan_index - ); + shash_update( + iqsync->sources, + iqsync->scan_index, + cur_scan_index, + scan_index + ); // Indicate to the caller the src_index of the last message from // src_id. if (src_index_out) - *src_index_out = src_index; + *src_index_out = src_index; // And return how far we have scanned through the iqueue. // If there is no race, this will be the first empty element of // the iqueue, which is where it will append the incoming message. if (orig_scan_index != scan_index) - TSLOGX(TSDEBUG, "%s: Scanned from %"PRIu64" to %"PRIu64, - iqsync->local.name, - orig_scan_index, - scan_index - ); + TSLOGXL(TSDEBUG, "%s: Scanned from %"PRIu64" to %"PRIu64, + iqsync->local.name, + orig_scan_index, + scan_index + ); return scan_index; } @@ -366,38 +397,164 @@ iqsync_sources_scan( { // One of our own? if (cur_src == iqsync->local.creation) - goto discard; + goto discard; // Refresh to the end of the iqueue and retrieve the // latest message index from the current source. uint64_t src_index; const uint64_t scan_index = iqsync_sources_scan_all( - iqsync, - cur_src, - &src_index + iqsync, + cur_src, + &src_index ); // So far, so good. If the incoming message is the next expected // one, then we're done. if (cur_index >= src_index) - return scan_index; + return scan_index; // The incoming message has already been seen. // Signal that it should be discarded. 
discard: if (iqsync->verbose || iqsync->warned_cycle == 0) { - iqsync->warned_cycle = 1; - TSLOGX(TSINFO, "%s: Discarding %"PRIu64".%"PRIu64, - iqsync->remote.name, - cur_src, - cur_index - ); + iqsync->warned_cycle = 1; + TSLOGXL(TSINFO, "%s: Discarding %"PRIu64".%"PRIu64, + iqsync->remote.name, + cur_src, + cur_index + ); } return IQUEUE_MSG_BAD_ID; } +/** Read data from the read_fd, checking for do_shutdown */ +static int +iqsync_read( + iqsync_t * const iqsync, + void *buf_ptr, + size_t len) +{ + uint8_t * buf = buf_ptr; + + fd_set readfds; + fd_set exceptfds; + + FD_ZERO(&readfds); + FD_ZERO(&exceptfds); + + if (iqsync->recvbuffer) { + while (iqsync->recvbuffer_write_idx - iqsync->recvbuffer_read_idx < len) { + if (iqsync->do_shutdown) { + return 0; + } + + struct timeval timeout = { + .tv_sec = 1, + .tv_usec = 0 + }; + + FD_SET(iqsync->read_fd, &readfds); + FD_SET(iqsync->read_fd, &exceptfds); + + int rc = select(iqsync->read_fd + 1, + &readfds, NULL, &exceptfds, &timeout); + if (rc == 0) + continue; + else if (rc < 0) { + TSLOGXL(TSERROR, "%s: Select failed", iqsync->remote.name); + return -1; + } + + uint64_t read_block_idx = iqsync_block_index( + iqsync->recvbuffer_read_idx, iqsync->recvbuffer_block_shift); + uint64_t write_block_index = iqsync_block_index( + iqsync->recvbuffer_write_idx, iqsync->recvbuffer_block_shift); + uint64_t write_block_offset = iqsync_block_offset( + iqsync->recvbuffer_write_idx, iqsync->recvbuffer_offset_mask); + + size_t max_read = + read_block_idx == write_block_index ? 
+ iqsync->recvbuffer_len - write_block_offset : + iqsync->recvbuffer_read_idx + iqsync->recvbuffer_len - iqsync->recvbuffer_write_idx; + + ssize_t rlen = read( + iqsync->read_fd, + iqsync->recvbuffer + write_block_offset, + max_read); + if (rlen < 0) { + // This should be impossible since the select said there was data + assert(errno != EAGAIN); + + if (errno == EINTR) + continue; + + return -1; + } + + iqsync->recvbuffer_write_idx += rlen; + } + + uint32_t read_block_offset = iqsync_block_offset( + iqsync->recvbuffer_read_idx, iqsync->recvbuffer_offset_mask); + + if (read_block_offset + len <= iqsync->recvbuffer_len) { + memcpy(buf, iqsync->recvbuffer + read_block_offset, len); + } else { + // message spans over the boundary + size_t part_len = iqsync->recvbuffer_len - read_block_offset; + memcpy(buf, iqsync->recvbuffer + read_block_offset, part_len); + memcpy(buf + part_len, iqsync->recvbuffer, len - part_len); + } + iqsync->recvbuffer_read_idx += len; + return len; + } else { + size_t offset = 0; + + while (1) + { + if (iqsync->do_shutdown) + return 0; + + struct timeval timeout = { + .tv_sec = 1, + .tv_usec = 0 + }; + + FD_SET(iqsync->read_fd, &readfds); + FD_SET(iqsync->read_fd, &exceptfds); + + int rc = select(iqsync->read_fd + 1, + &readfds, NULL, &exceptfds, &timeout); + if (rc == 0) + continue; + else if (rc < 0) { + TSLOGXL(TSERROR, "%s: Select failed", iqsync->remote.name); + return -1; + } + + ssize_t rlen = read(iqsync->read_fd, buf, len); + if (rlen == (ssize_t) len || rlen == 0) + return offset + rlen; + + if (rlen < 0) + { + // This should be impossible since the select said there was data + assert(errno != EAGAIN); + + if (errno == EINTR) + continue; + + return -1; + } + + offset += rlen; + buf += rlen; + len -= rlen; + } + } +} static int iqsync_start_recv( @@ -408,49 +565,66 @@ iqsync_start_recv( // Read where they want us to start struct iqsync_start start; - if (tsio_read_all( - iqsync->read_fd, - &start, - sizeof(start) + if (iqsync_read( + 
iqsync, + &start, + sizeof(start) ) != sizeof(start)) { - TSLOGX(TSERROR, "%s: Start message read failed", iqsync->remote.name); - return -1; + TSLOGXL(TSERROR, "%s: Start message read failed", iqsync->remote.name); + return -1; } const uint64_t remote_magic = be64toh(start.magic); if (remote_magic != IQSYNC_START_MAGIC) { - TSLOGX(TSERROR, "%s: Start message bad magic %"PRIx64" != expected %"PRIx64, - iqsync->remote.name, - remote_magic, - IQSYNC_START_MAGIC - ); - return -1; + TSLOGXL(TSERROR, "%s: Start message bad magic %"PRIx64" != expected %"PRIx64, + iqsync->remote.name, + remote_magic, + IQSYNC_START_MAGIC + ); + return -1; } iqsync->local.index = be64toh(start.start_index); //uint64_t flags = be64toh(start.flags); if (iqsync->local.index > iqueue_entries(iq)) - TSLOGX(TSWARN, - "%s: Starting at %"PRIu64", but only %"PRIu64" entries so far", - iqsync->local.name, - iqsync->local.index, - iqueue_entries(iq) - ); + TSLOGXL(TSWARN, + "%s: Starting at %"PRIu64", but only %"PRIu64" entries so far", + iqsync->local.name, + iqsync->local.index, + iqueue_entries(iq) + ); if (iqsync->verbose) - TSLOGX(TSINFO, "%s: Starting at %"PRIu64"/%"PRIu64" and will %s when done", - iqsync->local.name, - iqsync->local.index, - iqueue_entries(iq), - iqsync->do_tail ? "tail" : "exit" + TSLOGXL(TSINFO, "%s: Starting at %"PRIu64"/%"PRIu64" and will %s when done", + iqsync->local.name, + iqsync->local.index, + iqueue_entries(iq), + iqsync->do_tail ? 
"tail" : "exit" ); return 0; } +static int +iqsync_flush_sendbuffer(iqsync_t * const iqsync) +{ + if (iqsync->sendbuffer_data_len == 0) + return 0; + + ssize_t wlen = write_all( + iqsync->write_fd, + iqsync->sendbuffer, + iqsync->sendbuffer_data_len); + if (wlen <= 0) { + return -1; + } + + iqsync->sendbuffer_data_len = 0; + return 0; +} static int iqsync_push_one( @@ -461,12 +635,12 @@ iqsync_push_one( ) { struct iqsync_data msg = { - .magic = htobe64(IQSYNC_DATA_MAGIC), - .src = htobe64(iqsync->local.creation), - .orig_src = htobe64(iqsync->local.creation), - .orig_index = htobe64(local_index), - .iq_index = htobe64(local_index), - .len = htobe32((uint32_t) data_len), + .magic = htobe64(IQSYNC_DATA_MAGIC), + .src = htobe64(iqsync->local.creation), + .orig_src = htobe64(iqsync->local.creation), + .orig_index = htobe64(local_index), + .iq_index = htobe64(local_index), + .len = htobe32((uint32_t) data_len), }; // Check to see if this is one that we received from @@ -474,51 +648,69 @@ iqsync_push_one( const struct iqsync_data * const sync_msg = iqsync_data_msg(iqsync->iq, offset); if (sync_msg) { - // Avoid the obvious cycle to our direct correspondent - if (sync_msg->src == htobe64(iqsync->remote.creation)) - return 1; - - // Flag the message with the original source and index - // Note that the values in the sync_msg are already in - // network byte order - msg.orig_src = sync_msg->orig_src; - msg.orig_index = sync_msg->orig_index; + // Avoid the obvious cycle to our direct correspondent + if (sync_msg->src == htobe64(iqsync->remote.creation)) + return 1; + + // Flag the message with the original source and index + // Note that the values in the sync_msg are already in + // network byte order + msg.orig_src = sync_msg->orig_src; + msg.orig_index = sync_msg->orig_index; } const void * data = iqueue_get_data(iqsync->iq, offset, 1); struct iovec iov[] = { - { .iov_base = &msg, .iov_len = sizeof(msg) }, - { .iov_base = (void*)(uintptr_t) data, .iov_len = data_len }, + 
{ .iov_base = &msg, .iov_len = sizeof(msg) }, + { .iov_base = (void*)(uintptr_t) data, .iov_len = data_len }, }; size_t total_len = iov[0].iov_len + iov[1].iov_len; if (iqsync->verbose) - TSLOGX(TSINFO, "%s: sending index %"PRIu64": %zu bytes", - iqsync->local.name, - local_index, - data_len - ); - - ssize_t wlen = tsio_writev_all(iqsync->write_fd, total_len, iov, 2); - if (wlen < 0) - { - TSLOG(TSERROR, "%s: write failed!", iqsync->remote.name); - return -1; - } - - if (wlen != (ssize_t) total_len) - { - TSLOGX(TSWARN, "%s: Connection closed", iqsync->remote.name); - return 0; + TSLOGXL(TSINFO, "%s: sending index %"PRIu64": %zu bytes", + iqsync->local.name, + local_index, + data_len + ); + + if (!iqsync->sendbuffer || total_len > iqsync->sendbuffer_len) { + if (iqsync->sendbuffer) { + if (iqsync_flush_sendbuffer(iqsync) < 0) { + TSLOGL(TSERROR, "%s: write failed!", iqsync->remote.name); + return -1; + } + } + ssize_t wlen = writev_all(iqsync->write_fd, total_len, iov, 2); + if (wlen < 0) { + TSLOGL(TSERROR, "%s: write failed!", iqsync->remote.name); + return -1; + } + + if (wlen != (ssize_t) total_len) { + TSLOGXL(TSWARN, "%s: Connection closed", iqsync->remote.name); + return 0; + } + } else { + if (iqsync->sendbuffer_data_len + total_len > iqsync->sendbuffer_len) { + if (iqsync_flush_sendbuffer(iqsync) < 0) { + TSLOGL(TSERROR, "%s: write failed!", iqsync->remote.name); + return -1; + } + } + + memcpy(iqsync->sendbuffer + iqsync->sendbuffer_data_len, + &msg, sizeof(msg)); + memcpy(iqsync->sendbuffer + iqsync->sendbuffer_data_len + sizeof(msg), + data, data_len); + iqsync->sendbuffer_data_len += sizeof(msg) + data_len; } iqsync->local.count++; return 1; } - /** Prevent the aggregate read rate from exceeding the rate limit. * This makes iqsync on a large file less likely to blow out the caches. 
*/ @@ -535,26 +727,26 @@ iqsync_rate_limit( const uint64_t limit = iqsync->rate_limit << 20; // scale MB/s to B/s if (limit == 0) - return; + return; uint64_t delta = tsclock_getnanos(0) - start_time; uint64_t ns_sleep_time = (avg_len * 1000000000ul) / limit; if (ns_sleep_time < delta) - return; + return; if (ns_sleep_time < 60000) { - // Just busy wait until our time period has expired - while ((uint64_t) tsclock_getnanos(0) < start_time + ns_sleep_time) - continue; + // Just busy wait until our time period has expired + while ((uint64_t) tsclock_getnanos(0) < start_time + ns_sleep_time) + continue; } else { - // The minimum sleep time seems to be about 60 usec, - ns_sleep_time -= delta; - nanosleep(&(struct timespec) { - .tv_sec = ns_sleep_time / 1000000000ull, - .tv_nsec = ns_sleep_time % 1000000000ull - }, NULL); + // The minimum sleep time seems to be about 60 usec, + ns_sleep_time -= delta; + nanosleep(&(struct timespec) { + .tv_sec = ns_sleep_time / 1000000000ull, + .tv_nsec = ns_sleep_time % 1000000000ull + }, NULL); } } @@ -573,56 +765,56 @@ iqsync_setup_heartbeats( ) { if (iqsync->heartbeats_hash) - return 1; + return 1; tslock(iqsync->heartbeats_lock); if (iqsync->heartbeats_hash) { - tsunlock(iqsync->heartbeats_lock); - return 1; + tsunlock(iqsync->heartbeats_lock); + return 1; } shash_t * sh = iqueue_writer_table( - iqsync->iq, - 0, // default writer table - create_flag + iqsync->iq, + 0, // default writer table + create_flag ); // If it still doesn't exist, don't worry about it. No heartbeats // will be pushed until it is created. 
if (!sh) { - if (create_flag) - TSABORTX("%s: Unable to create heartbeat table", - iqsync->local.name - ); + if (create_flag) + TSABORTX("%s: Unable to create heartbeat table", + iqsync->local.name + ); - tsunlock(iqsync->heartbeats_lock); - return 0; + tsunlock(iqsync->heartbeats_lock); + return 0; } // Create a thread-local version sh = shash_copy(sh); iqsync->heartbeats = shash_entries(sh, &iqsync->heartbeats_max); - TSLOGX(TSDEBUG, "%s: Heartbeat table %p has %u entries", - iqsync->local.name, - iqsync->heartbeats, - iqsync->heartbeats_max + TSLOGXL(TSDEBUG, "%s: Heartbeat table %p has %u entries", + iqsync->local.name, + iqsync->heartbeats, + iqsync->heartbeats_max ); iqsync->heartbeats_copy = calloc(1, - iqsync->heartbeats_max * sizeof(*iqsync->heartbeats_copy) + iqsync->heartbeats_max * sizeof(*iqsync->heartbeats_copy) ); if (!iqsync->heartbeats_copy) - TSABORT("failed to allocate %u writers", iqsync->heartbeats_max); + TSABORT("failed to allocate %u writers", iqsync->heartbeats_max); iqsync->heartbeat_msg = calloc(1, - sizeof(*iqsync->heartbeat_msg) - + iqsync->heartbeats_max * sizeof(*iqsync->heartbeat_msg->writers) + sizeof(*iqsync->heartbeat_msg) + + iqsync->heartbeats_max * sizeof(*iqsync->heartbeat_msg->writers) ); if (!iqsync->heartbeat_msg) - TSABORT("failed to allocate %u writers message", iqsync->heartbeats_max); + TSABORT("failed to allocate %u writers message", iqsync->heartbeats_max); iqsync->heartbeats_hash = sh; tsunlock(iqsync->heartbeats_lock); @@ -632,7 +824,7 @@ iqsync_setup_heartbeats( /** Send a set of messages, from iqsync->local.index to end_index. - * \return 0 on success, -1 on any failures. + * \return the number of messages processed on success, -1 on any failures. 
*/ static int iqsync_send_set( @@ -644,93 +836,135 @@ iqsync_send_set( while (iqsync->local.index < end_index) { - const uint64_t i = iqsync->local.index++; - size_t len; - const uint64_t offset = iqueue_offset(iqsync->iq, i, &len); - - if (offset == (uint64_t) -1) - { - TSLOGX(TSERROR, "%s: No data at index %"PRIu64"?", - iqsync->local.name, - i - ); - return -1; - } - - uint64_t start_time = tsclock_getnanos(0); - if (iqsync_push_one(iqsync, i, offset, len) <= 0) - return -1; - - iqsync_rate_limit(iqsync, start_time, len); + const uint64_t idx = iqsync->local.index++; + size_t len; + const uint64_t offset = iqueue_offset(iqsync->iq, idx, &len); + + if (offset == (uint64_t) -1) + { + TSLOGXL(TSERROR, "%s: No data at index %"PRIu64"?", + iqsync->local.name, + idx + ); + return -1; + } + + if (iqsync->filter_count > 0) { + bool filter_out = false; + for (unsigned i = 0; i < iqsync->filter_count; i++) { + iqsync_filter_t *filter = &iqsync->filters[i]; + + int rc = filter->filter_fn(filter->filter_fn_priv, + iqueue_get_data(iqsync->iq, offset, 1), len); + + if (rc == 0) { + filter_out = true; + break; + } else if (rc < 0) { + TSLOGXL(TSINFO, "Filter funtion requested iqsync shutdown"); + iqsync->do_shutdown = 1; + return 0; + } + } + + if (filter_out) + continue; + } + + uint64_t start_time = tsclock_getnanos(0); + if (iqsync_push_one(iqsync, idx, offset, len) <= 0) + return -1; + + iqsync_rate_limit(iqsync, start_time, len); + } + + if (iqsync->sendbuffer && iqsync_flush_sendbuffer(iqsync) < 0) { + TSLOGL(TSERROR, "%s: flush failed!", iqsync->remote.name); + return -1; } if (start_index != end_index) - TSLOGX(TSDEBUG, "%s: Send %"PRIu64" to %"PRIu64, - iqsync->local.name, - start_index, - end_index - ); + TSLOGXL(TSDEBUG, "%s: Send %"PRIu64" to %"PRIu64, + iqsync->local.name, + start_index, + end_index + ); - return 0; + return end_index - start_index; } +/** Sync heartbeats and messages from our local iqueue to the remote iqueue. 
+ * This method will perform the following actions: + * 1. Snapshot the heartbeats + * 2. Send all pending data messages + * 3. Send heartbeats + * We snapshot the hearbeats beforehand to ensure the invariant that any + * heartbeat we send was written before the data we synced. This allows + * users to see the heartbeat as a lowerbound for the data synced. + * \return the number of messages processed on success, -1 on any failures. + */ static int -iqsync_push_heartbeats( +iqsync_push_state( iqsync_t * const iqsync ) { - if (!iqsync_setup_heartbeats(iqsync, 0)) - return 0; - struct iqsync_heartbeat * const msg = iqsync->heartbeat_msg; - - unsigned count = 0; - - for (unsigned i = 0 ; i < iqsync->heartbeats_max ; i++) - { - // This does not guarantee ordering of updates to different keys. - const shash_entry_t heartbeat = iqsync->heartbeats[i]; - shash_entry_t * const copy = &iqsync->heartbeats_copy[i]; - if (heartbeat.key == 0) - break; - if (heartbeat.key == copy->key && heartbeat.value == copy->value) - continue; - - // A new timestamp. Update the cached copy - memcpy(copy, &heartbeat, sizeof(*copy)); - memcpy(&msg->writers[count++], &heartbeat, sizeof(*copy)); + struct iqsync_heartbeat* const hb_msg = iqsync_setup_heartbeats(iqsync, 0) ? + iqsync->heartbeat_msg : NULL; + unsigned heartbeat_count = 0; + if (hb_msg) { + for (unsigned i = 0 ; i < iqsync->heartbeats_max ; i++) + { + // This does not guarantee ordering of updates to different keys. + const shash_entry_t heartbeat = iqsync->heartbeats[i]; + shash_entry_t * const copy = &iqsync->heartbeats_copy[i]; + if (heartbeat.key == 0) + break; + if (heartbeat.key == copy->key && heartbeat.value == copy->value) + continue; + + // A new timestamp. 
Update the cached copy + memcpy(copy, &heartbeat, sizeof(*copy)); + memcpy(&hb_msg->writers[heartbeat_count++], &heartbeat, + sizeof(*copy)); + } + hb_msg->magic_be64 = htobe64(IQSYNC_HEARTBEAT_MAGIC); + hb_msg->count_be64 = htobe64(heartbeat_count); } - if (count == 0) - return 0; - - msg->magic_be64 = htobe64(IQSYNC_HEARTBEAT_MAGIC); - msg->count_be64 = htobe64(count); // Make sure that all pending messages have been sent // to ensure that heartbeats do not arrive before any messages // that were written before the hearbeat. - if (iqsync_send_set(iqsync, iqueue_entries(iqsync->iq)) < 0) - return -1; + int synced = 0; + if ((synced = iqsync_send_set(iqsync, iqueue_entries(iqsync->iq))) < 0) { + return -1; + } // At this point the cached copy of the heartbeat table might be // old, but it meets the guarantee that all messages that were // present at the time it was duplicated have been sent to the // destination iqueue. It is now safe to send the entire table // of updates. + if (heartbeat_count > 0) { + tsassert(hb_msg); // we should have this, guaranteed + ssize_t wlen = write_all( + iqsync->write_fd, + hb_msg, + sizeof(*hb_msg) + heartbeat_count * sizeof(hb_msg->writers[0]) + ); + if (unlikely(wlen <= 0)) { + TSLOGXL(TSERROR, "%s: Failed to send %d heartbeats", + iqsync->local.name, heartbeat_count); + return -1; + } else { + TSLOGXL(TSDEBUG, "%s: Sent %u heartbeat updates", + iqsync->local.name, heartbeat_count); + } + } - ssize_t wlen = tsio_write_all( - iqsync->write_fd, - msg, - sizeof(*msg) + count * sizeof(msg->writers[0]) - ); - if (wlen <= 0) - return -1; - - TSLOGX(TSDEBUG, "%s: Sent %u heartbeat updates", iqsync->local.name, count); - - return 0; + return synced; } @@ -749,46 +983,40 @@ iqsync_push_thread( while (!iqsync->do_shutdown) { - // By only sending up to the end of the iqueue at the current - // time, we ensure that progress is made on the heartbeat sending. 
- if (iqsync_send_set(iqsync, iqueue_entries(iq)) < 0) - { - TSLOGX(TSWARN, "%s: Send set failed", iqueue_name(iq)); - break; - } - - // Everytime there is no data, check for heartbeats - if (iqsync_push_heartbeats(iqsync) < 0) - { - TSLOGX(TSWARN, "%s: Heartbeat send failed", iqueue_name(iq)); - break; - } - - if (!iqsync->do_tail) - { - TSLOGX(TSINFO, "%s: Reached end and not tailing", iqueue_name(iq)); - break; - } - - if (iqueue_is_sealed(iq)) - { - TSLOGX(TSINFO, "%s: Has been sealed", iqueue_name(iq)); - break; - } - - if (iqsync->usleep_time) - usleep(iqsync->usleep_time); + int synced = 0; + if ((synced = iqsync_push_state(iqsync)) < 0) + { + TSLOGXL(TSWARN, "%s: State send failed", iqueue_name(iq)); + break; + } + + if (!iqsync->do_tail) + { + TSLOGXL(TSINFO, "%s: Reached end and not tailing", iqueue_name(iq)); + break; + } + + if (iqueue_is_sealed(iq)) + { + TSLOGXL(TSINFO, "%s: Has been sealed", iqueue_name(iq)); + break; + } + + // sleep for a bit if we didn't do any work syncing messages + if (synced == 0 && iqsync->usleep_time) { + usleep(iqsync->usleep_time); + } } if (iqsync->verbose) - TSLOGX(TSINFO, "%s: Done sending at index %"PRIu64, - iqsync->remote.name, - iqsync->local.index + TSLOGXL(TSINFO, "%s: Done sending at index %"PRIu64, + iqsync->remote.name, + iqsync->local.index ); close(iqsync->write_fd); if (!iqsync->do_pull) - iqsync->do_shutdown = 1; + iqsync->do_shutdown = 1; return NULL; } @@ -815,75 +1043,79 @@ iqsync_recv( const uint64_t orig_index = be64toh(msg->orig_index); const uint64_t remote_index = be64toh(msg->iq_index); + if (orig_src == 0) + TSABORTX("Received bad source id! 
orig_src: %"PRIu64" orig_index: %"PRIu64 + " remote_index: %"PRIu64, orig_src, orig_index, remote_index); + // Adjust the message to skip the data at the head const size_t data_offset = offsetof(struct iqsync_data, data); const iqueue_msg_t new_iqmsg = iqueue_msg( - iqueue_msg_offset(iqmsg) + data_offset, - iqueue_msg_len(iqmsg) - data_offset + iqueue_msg_offset(iqmsg) + data_offset, + iqueue_msg_len(iqmsg) - data_offset ); while (1) { - const uint64_t local_index = iqsync_sources_scan( - iqsync, - orig_src, - orig_index - ); - if (local_index == IQUEUE_MSG_BAD_ID) - return 0; - - // Try to store the new entry at the last slot scanned. - // Note that the index entry points to the data section, - // not the iqsync_msg header portion (which will be in the file for - // future reference). - int rc = iqueue_try_update( - iqsync->iq, - local_index, - new_iqmsg - ); - - // We were not successful; rescan the sources and try again - if (rc == IQUEUE_STATUS_HAS_DATA) - { - TSLOGX(TSDEBUG, "%s: Lost race at %"PRIu64" for %"PRIx64".%"PRIu64, - iqsync->local.name, - local_index, - orig_src, - orig_index - ); - continue; - } - - if (rc == IQUEUE_STATUS_SEALED) - { - TSLOGX(TSWARN, "%s: File has been sealed. Stopping sync.", iqueue_name(iqsync->iq)); - return -1; - } - - if (rc != 0) - { - TSLOGX(TSERROR, "%s: Unable to store %zu bytes at %"PRIu64"! rc=%d", - iqsync->local.name, - data_len, - local_index, - rc - ); - return -1; - } - - // We have successfully written at the desired slot, which means - // no new messages arrived while we were consulting the hash tables. 
- if (iqsync->verbose) - TSLOGX(TSINFO, "%s: Stored remote %"PRIu64" as %"PRIu64, - iqsync->local.name, - remote_index, - local_index - ); - - iqsync->remote.count++; - iqsync->remote.index = remote_index; - - return 1; + const uint64_t local_index = iqsync_sources_scan( + iqsync, + orig_src, + orig_index + ); + if (local_index == IQUEUE_MSG_BAD_ID) + return 0; + + // Try to store the new entry at the last slot scanned. + // Note that the index entry points to the data section, + // not the iqsync_msg header portion (which will be in the file for + // future reference). + int rc = iqueue_try_update( + iqsync->iq, + local_index, + new_iqmsg + ); + + // We were not successful; rescan the sources and try again + if (rc == IQUEUE_STATUS_HAS_DATA) + { + TSLOGXL(TSDEBUG, "%s: Lost race at %"PRIu64" for %"PRIx64".%"PRIu64, + iqsync->local.name, + local_index, + orig_src, + orig_index + ); + continue; + } + + if (rc == IQUEUE_STATUS_SEALED) + { + TSLOGXL(TSWARN, "%s: File has been sealed. Stopping sync.", iqueue_name(iqsync->iq)); + return -1; + } + + if (rc != 0) + { + TSLOGXL(TSERROR, "%s: Unable to store %zu bytes at %"PRIu64"! rc=%d", + iqsync->local.name, + data_len, + local_index, + rc + ); + return -1; + } + + // We have successfully written at the desired slot, which means + // no new messages arrived while we were consulting the hash tables. 
+ if (iqsync->verbose) + TSLOGXL(TSINFO, "%s: Stored remote %"PRIu64" as %"PRIu64, + iqsync->local.name, + remote_index, + local_index + ); + + iqsync->remote.count++; + iqsync->remote.index = remote_index; + + return 1; } } @@ -898,25 +1130,25 @@ iqsync_pull_one_data( iqueue_msg_t iqmsg; struct iqsync_data * const msg = iqueue_allocate( - allocator, - alloc_len, - &iqmsg + allocator, + alloc_len, + &iqmsg ); if (!msg) { - TSLOGX(TSERROR, "%s: Unable to allocate message", iqsync->local.name); - return -1; + TSLOGXL(TSERROR, "%s: Unable to allocate message", iqsync->local.name); + return -1; } - ssize_t rlen = tsio_read_all( - iqsync->read_fd, - ((uint8_t*) msg) + sizeof(msg->magic), - sizeof(*msg) - sizeof(msg->magic) + ssize_t rlen = iqsync_read( + iqsync, + ((uint8_t*) msg) + sizeof(msg->magic), + sizeof(*msg) - sizeof(msg->magic) ); if (rlen < 0) - return -1; // error + return -1; // error if (rlen != sizeof(*msg) - sizeof(msg->magic)) - return 0; // closed fd + return 0; // closed fd // Fill in the magic header msg->magic = htobe64(IQSYNC_DATA_MAGIC); @@ -924,8 +1156,8 @@ iqsync_pull_one_data( // If this is just a keep-alive, we have nothing else to process if (msg->len == 0 && msg->src == 0 && msg->orig_src == 0) { - iqueue_realloc_bulk(allocator, &iqmsg, alloc_len, 0); - return 1; + iqueue_realloc_bulk(allocator, &iqmsg, alloc_len, 0); + return 1; } const size_t data_len = be32toh(msg->len); @@ -935,42 +1167,42 @@ iqsync_pull_one_data( if (data_len > IQUEUE_MSG_MAX) { - TSLOGX(TSERROR, "%s: Message %"PRIu64" len %zu greater than max %zu", - iqsync->remote.name, - be64toh(msg->iq_index), - data_len, - (size_t) IQUEUE_MSG_MAX - ); - return -1; + TSLOGXL(TSERROR, "%s: Message %"PRIu64" len %zu greater than max %zu", + iqsync->remote.name, + be64toh(msg->iq_index), + data_len, + (size_t) IQUEUE_MSG_MAX + ); + return -1; } - rlen = tsio_read_all(iqsync->read_fd, msg->data, data_len); + rlen = iqsync_read(iqsync, msg->data, data_len); if (rlen < 0) - return -1; // 
error + return -1; // error if (rlen != (ssize_t) data_len) - return 0; // closed fd + return 0; // closed fd if (iqueue_realloc_bulk( - allocator, - &iqmsg, - alloc_len, - msg_len + allocator, + &iqmsg, + alloc_len, + msg_len ) < 0) { - TSLOGX(TSERROR, "%s: Unable to resize from %zu to %zu?", - iqsync->local.name, - alloc_len, - msg_len - ); - return -1; + TSLOGXL(TSERROR, "%s: Unable to resize from %zu to %zu?", + iqsync->local.name, + alloc_len, + msg_len + ); + return -1; } // Now that the message has been fully received into the buffer, // try to post it to the iqueue. int rc = iqsync_recv(iqsync, msg, iqmsg); if (rc == 1) - return 1; + return 1; if (rc < 0) - return -1; + return -1; // Too old or from ourselves; discard it, but do not signal an error iqueue_realloc_bulk(allocator, &iqmsg, alloc_len, 0); @@ -988,49 +1220,49 @@ iqsync_pull_one_heartbeat( struct iqsync_heartbeat msg; ssize_t rlen; - rlen = tsio_read_all( - iqsync->read_fd, - ((uint8_t*) &msg) + sizeof(msg.magic_be64), - sizeof(msg) - sizeof(msg.magic_be64) + rlen = iqsync_read( + iqsync, + ((uint8_t*) &msg) + sizeof(msg.magic_be64), + sizeof(msg) - sizeof(msg.magic_be64) ); if (rlen <= 0) - return (int) rlen; + return (int) rlen; const uint64_t count = be64toh(msg.count_be64); if (count > iqsync->heartbeats_max) - TSABORTX("%s: Sent %"PRIu64" heartbeats? Max %u", - iqsync->remote.name, - count, - iqsync->heartbeats_max - ); + TSABORTX("%s: Sent %"PRIu64" heartbeats? 
Max %u", + iqsync->remote.name, + count, + iqsync->heartbeats_max + ); shash_entry_t heartbeats[count]; - rlen = tsio_read_all(iqsync->read_fd, heartbeats, sizeof(heartbeats)); + rlen = iqsync_read(iqsync, heartbeats, sizeof(heartbeats)); if (rlen <= 0) - return rlen; + return rlen; for (unsigned i = 0 ; i < count ; i++) { - shash_entry_t * const heartbeat = &heartbeats[i]; - - if (heartbeat->key == 0 || heartbeat->key == ~(uint64_t) 0) - { - TSLOGX(TSWARN, "%s: Sent writer with invalid id/timestamp %"PRIx64":%"PRIx64"?", - iqsync->remote.name, - heartbeat->key, - heartbeat->value - ); - continue; - } - - iqsync_hash_update( - iqsync->heartbeats_hash, - heartbeat->key, - heartbeat->value - ); + shash_entry_t * const heartbeat = &heartbeats[i]; + + if (heartbeat->key == 0 || heartbeat->key == ~(uint64_t) 0) + { + TSLOGXL(TSWARN, "%s: Sent writer with invalid id/timestamp %"PRIx64":%"PRIx64"?", + iqsync->remote.name, + heartbeat->key, + heartbeat->value + ); + continue; + } + + iqsync_hash_update( + iqsync->heartbeats_hash, + heartbeat->key, + heartbeat->value + ); } - TSLOGX(TSDEBUG, "Received %"PRIu64" heartbeats", count); + TSLOGXL(TSDEBUG, "Received %"PRIu64" heartbeats", count); return 1; } @@ -1046,21 +1278,21 @@ iqsync_pull_one( ) { uint64_t magic_be64; - ssize_t rlen = tsio_read_all(iqsync->read_fd, &magic_be64, sizeof(magic_be64)); + ssize_t rlen = iqsync_read(iqsync, &magic_be64, sizeof(magic_be64)); if (rlen < 0) - return -1; + return -1; if (rlen != sizeof(magic_be64)) - return 0; + return 0; const uint64_t magic = be64toh(magic_be64); if (magic == IQSYNC_DATA_MAGIC) - return iqsync_pull_one_data(iqsync, allocator); + return iqsync_pull_one_data(iqsync, allocator); if (magic == IQSYNC_HEARTBEAT_MAGIC) - return iqsync_pull_one_heartbeat(iqsync); + return iqsync_pull_one_heartbeat(iqsync); - TSLOGX(TSERROR, "%s: Bad magic %"PRIx64". Unknown type!", - iqsync->remote.name, - magic + TSLOGXL(TSERROR, "%s: Bad magic %"PRIx64". 
Unknown type!", + iqsync->remote.name, + magic ); return -1; } @@ -1084,30 +1316,30 @@ iqsync_start_send( // in computing the starting index, it might be too low and the first // few incoming packets will be dropped. iqsync->remote.index = iqsync_sources_get( - iqsync, - iqsync->remote.creation + iqsync, + iqsync->remote.creation ); // Send the ack asking to start at the next message, or 0 // if there is no value already recorded for this source. struct iqsync_start start = { - .magic = htobe64(IQSYNC_START_MAGIC), - .start_index = htobe64(iqsync->remote.index), - .flags = htobe64(0), + .magic = htobe64(IQSYNC_START_MAGIC), + .start_index = htobe64(iqsync->remote.index), + .flags = htobe64(0), }; - if (tsio_write_all( - iqsync->write_fd, - &start, - sizeof(start) + if (write_all( + iqsync->write_fd, + &start, + sizeof(start) ) != sizeof(start)) { - TSLOGX(TSERROR, "%s: Write error on start message", iqsync->remote.name); - return -1; + TSLOGXL(TSERROR, "%s: Write error on start message", iqsync->remote.name); + return -1; } if (iqsync->verbose) - TSLOGX(TSINFO, "send RX request start at %"PRIu64, iqsync->remote.index); + TSLOGXL(TSINFO, "send RX request start at %"PRIu64, iqsync->remote.index); return 0; } @@ -1128,33 +1360,40 @@ iqsync_pull_thread( // Pre-allocate some space for incoming messages iqueue_allocator_t allocator; if (iqueue_allocator_init( - iqsync->iq, - &allocator, - IQUEUE_MSG_MAX * 4, // try to avoid re-filling too often - 1 + iqsync->iq, + &allocator, + IQUEUE_MSG_MAX * 4, // try to avoid re-filling too often + 1 ) < 0) - TSABORTX("%s: Unable to create allocator", iqsync->local.name); + TSABORTX("%s: Unable to create allocator", iqsync->local.name); // Read messages until we have an error or a closed connection int rc; while ((rc = iqsync_pull_one(iqsync, &allocator)) == 1) { - // nop - // \todo: check for sealed iqueue + if (iqsync->do_shutdown) + break; + // nop + // \todo: check for sealed iqueue } - if (rc == 0) - { - if (iqsync->verbose) - 
TSLOGX(TSINFO, "%s: Connection closed: index %"PRIu64, - iqsync->remote.name, - iqsync->remote.index - ); + if (iqsync->do_shutdown) { + if (iqsync->verbose) + TSLOGXL(TSINFO, "%s: Pull thread detected shutdown: index %"PRIu64, + iqsync->remote.name, + iqsync->remote.index + ); + } else if (rc == 0) { + if (iqsync->verbose) + TSLOGXL(TSINFO, "%s: Connection closed: index %"PRIu64, + iqsync->remote.name, + iqsync->remote.index + ); } else { - TSLOG(TSERROR, "%s: Read failed: index %"PRIu64, - iqsync->remote.name, - iqsync->remote.index - ); + TSLOGL(TSERROR, "%s: Read failed: index %"PRIu64, + iqsync->remote.name, + iqsync->remote.index + ); } iqsync->do_shutdown = 1; @@ -1175,42 +1414,65 @@ iqsync_handshake_send( { iqueue_t * const iq = iqsync->iq; + // setup filter + if (iqsync->filter_count > 0) { + size_t hdr_len; + const void *hdr_buf = iqueue_header(iq, &hdr_len); + for (unsigned i = 0; i < iqsync->filter_count; i++) + { + TSLOGXL(TSINFO, "trying to setup %u-th iqsync filter", i); + iqsync_filter_t *filter = &iqsync->filters[i]; + if (!filter->filter_setup) { + if (!filter->filter_fn) { + TSABORTX("either setup or filter function is required for %u-th iqsync filter", i); + } + TSLOGXL(TSINFO, "%u-th iqsync filter does not have setup function", i); + } else { + if (filter->filter_setup(hdr_buf, hdr_len, + &filter->filter_fn_priv, &filter->filter_fn) == -1) { + TSABORT("could not setup %u-th requested filter", i); + } + TSLOGXL(TSINFO, "successfully setup %u-th iqsync filter", i); + } + } + } + iqsync->local.hdr = iqueue_header(iq, &iqsync->local.hdr_len); iqsync->local.creation = iqueue_creation(iq); iqsync->local.entries = iqueue_entries(iq); if (iqsync->verbose) - TSLOGX(TSINFO, "%s: Source creation=%"PRIu64" entries=%"PRIu64, - iqsync->local.name, - iqsync->local.creation, - iqsync->local.entries + TSLOGXL(TSINFO, "%s: Source creation=%"PRIu64" entries=%"PRIu64, + iqsync->local.name, + iqsync->local.creation, + iqsync->local.entries ); struct 
iqsync_handshake handshake = { - .magic = htobe64(IQSYNC_HANDSHAKE_MAGIC), - .creation = htobe64(iqsync->local.creation), - .entries = htobe64(iqsync->local.entries), - .hdr_len = htobe64(iqsync->local.hdr_len), + .magic = htobe64(IQSYNC_HANDSHAKE_MAGIC), + .creation = htobe64(iqsync->local.creation), + .entries = htobe64(iqsync->local.entries), + .hdr_len = htobe64(iqsync->local.hdr_len), }; // \todo Is this safe? What if hdr is long? Should there be // a split-phase handshake to ensure that we do not deadlock? struct iovec iov[] = { - { .iov_base = &handshake, .iov_len = sizeof(handshake) }, - { .iov_base = (void*)(uintptr_t) iqsync->local.hdr, .iov_len = iqsync->local.hdr_len }, + { .iov_base = &handshake, .iov_len = sizeof(handshake) }, + { .iov_base = (void*)(uintptr_t) iqsync->local.hdr, .iov_len = iqsync->local.hdr_len }, }; size_t total_len = iov[0].iov_len + iov[1].iov_len; - ssize_t wlen = tsio_writev_all(iqsync->write_fd, total_len, iov, 2); + ssize_t wlen = writev_all(iqsync->write_fd, total_len, iov, 2); if (wlen != (ssize_t) total_len) { - TSLOGX(TSERROR, - "%s: handshake write failed: %zd != %zu", - iqsync->remote.name, - wlen, - total_len - ); - return -1; + TSLOGL(TSERROR, + "%s: handshake write failed: %zd != %zu", + iqsync->remote.name, + wlen, + total_len + ); + return -1; } return 0; @@ -1227,49 +1489,88 @@ iqsync_handshake_recv( { // Read the handshake from the remote side struct iqsync_handshake reply; - ssize_t rlen = tsio_read_all(iqsync->read_fd, &reply, sizeof(reply)); + ssize_t rlen = iqsync_read(iqsync, &reply, sizeof(reply)); if (rlen != sizeof(reply)) { - TSLOGX(TSERROR, "%s: handshake read failed", iqsync->remote.name); - return -1; + TSLOGXL(TSERROR, "%s: handshake read failed", iqsync->remote.name); + return -1; } const uint64_t remote_magic = be64toh(reply.magic); if (remote_magic != IQSYNC_HANDSHAKE_MAGIC) { - TSLOGX(TSERROR, "%s: bad handshake magic: %"PRIu64" != %"PRIu64, - iqsync->remote.name, - remote_magic, - 
IQSYNC_HANDSHAKE_MAGIC - ); - return -1; + TSLOGXL(TSERROR, "%s: bad handshake magic: %"PRIu64" != %"PRIu64, + iqsync->remote.name, + remote_magic, + IQSYNC_HANDSHAKE_MAGIC + ); + return -1; } - iqsync->remote.creation = be64toh(reply.creation); - iqsync->remote.entries = be64toh(reply.entries); - iqsync->remote.hdr_len = be64toh(reply.hdr_len); + iqsync->remote.creation = be64toh(reply.creation); + iqsync->remote.entries = be64toh(reply.entries); + iqsync->remote.hdr_len = be64toh(reply.hdr_len); if (iqsync->remote.hdr_len == 0) - return 0; + return 0; iqsync->remote.hdr = malloc(iqsync->remote.hdr_len); if (!iqsync->remote.hdr) - TSABORT("hdr alloc failed: %"PRIu64" bytes", iqsync->remote.hdr_len); + TSABORT("hdr alloc failed: %"PRIu64" bytes", iqsync->remote.hdr_len); - if (tsio_read_all( - iqsync->read_fd, - iqsync->remote.hdr, - iqsync->remote.hdr_len + if (iqsync_read( + iqsync, + iqsync->remote.hdr, + iqsync->remote.hdr_len ) != (ssize_t) iqsync->remote.hdr_len) { - TSLOGX(TSERROR, "read of remote header failed"); - return -1; + TSLOGXL(TSERROR, "read of remote header failed"); + return -1; } return 0; } +static int +iqsync_header_verify( + iqsync_t * const iqsync +) +{ + if (!iqsync->do_hdr_validate) + return 0; + + size_t local_hdr_len; + const void * local_hdr = iqueue_header(iqsync->iq, &local_hdr_len); + + if (iqsync->remote.hdr_len != local_hdr_len) + { + TSLOGXL(TSERROR, "%s: remote header %"PRIu64" bytes != local %"PRIu64, + iqsync->remote.name, + iqsync->remote.hdr_len, + local_hdr_len + ); + return -1; + } + + if (memcmp(iqsync->remote.hdr, local_hdr, local_hdr_len) != 0) + { + TSLOGXL(TSERROR, "Remote header"); + TSHDUMPL(TSERROR, iqsync->remote.hdr, iqsync->remote.hdr_len); + TSLOGXL(TSERROR, "Local header"); + TSHDUMPL(TSERROR, local_hdr, local_hdr_len); + return -1; + } + + TSLOGXL(TSINFO, "%s: Headers verified with %s", + iqsync->local.name, + iqsync->remote.name + ); + + return 0; +} + + /** Receive the remote handshake and create the local 
iqueue based on the * remote parameters. @@ -1279,44 +1580,47 @@ iqsync_handshake_clone( iqsync_t * const iqsync ) { - TSLOGX(TSINFO, "%s: Not present; cloning from remote %s", - iqsync->local.name, - iqsync->remote.name + TSLOGXL(TSINFO, "%s: cloning from remote %s", + iqsync->local.name, + iqsync->remote.name ); // Receive the remote handshake message before sending ours if (iqsync_handshake_recv(iqsync) < 0) - return -1; + return -1; iqsync->local.hdr_len = iqsync->remote.hdr_len; if (iqsync->iq) { - TSLOGX(TSINFO, "%s: Using existing local iqueue", iqueue_name(iqsync->iq)); + TSLOGXL(TSINFO, "%s: Using existing local iqueue", iqueue_name(iqsync->iq)); } else { - const uint64_t local_creation = tsclock_getnanos(0); - iqsync->iq = iqueue_create( - iqsync->local.name, - local_creation, - iqsync->remote.hdr, - iqsync->remote.hdr_len - ); - - if (!iqsync->iq) - { - TSLOGX(TSERROR, "%s: Unable to open", iqsync->local.name); - return -1; - } - if (iqueue_creation(iqsync->iq) != local_creation) - { - TSLOGX(TSWARN, "%s: already exists; verify header?", iqsync->local.name); - } + const uint64_t local_creation = tsclock_getnanos(0); + iqsync->iq = iqueue_create( + iqsync->local.name, + local_creation, + iqsync->remote.hdr, + iqsync->remote.hdr_len + ); + + if (!iqsync->iq) + { + TSLOGXL(TSERROR, "%s: Unable to open", iqsync->local.name); + return -1; + } + if (iqueue_creation(iqsync->iq) != local_creation) + { + TSLOGXL(TSWARN, "%s: already existed", iqsync->local.name); + } } + if (iqsync_header_verify(iqsync) < 0) + return -1; + // Exchange handshake messages now that we have an iqueue created if (iqsync_handshake_send(iqsync) < 0) - return -1; + return -1; return 0; } @@ -1333,46 +1637,27 @@ iqsync_handshake_normal( if (iqsync->iq != NULL) { - iqsync->local.name = iqueue_name(iqsync->iq); + iqsync->local.name = iqueue_name(iqsync->iq); } else { - iqsync->iq = iqueue_open(iqsync->local.name, writable); - if (!iqsync->iq) - { - TSLOGX(TSERROR, "%s: Unable to open", 
iqsync->local.name); - return -1; - } - iqsync->close_iq_on_shutdown = true; + iqsync->iq = iqueue_open(iqsync->local.name, writable); + if (!iqsync->iq) + { + TSLOGXL(TSERROR, "%s: Unable to open", iqsync->local.name); + return -1; + } + iqsync->close_iq_on_shutdown = true; } // Exchange handshake messages if (iqsync_handshake_send(iqsync) < 0) - return -1; + return -1; if (iqsync_handshake_recv(iqsync) < 0) - return -1; + return -1; - if (!iqsync->do_hdr_validate) - return 0; - - if (iqsync->remote.hdr_len != iqsync->local.hdr_len) - { - TSLOGX(TSERROR, "%s: remote header %"PRIu64" bytes != local %"PRIu64, - iqsync->remote.name, - iqsync->remote.hdr_len, - iqsync->local.hdr_len - ); - return -1; - } - - if (memcmp(iqsync->remote.hdr, iqsync->local.hdr, iqsync->local.hdr_len) != 0) - { - TSLOGX(TSERROR, "Remote header"); - TSHDUMP(TSERROR, iqsync->remote.hdr, iqsync->remote.hdr_len); - TSLOGX(TSERROR, "Local header"); - TSHDUMP(TSERROR, iqsync->local.hdr, iqsync->local.hdr_len); - return -1; - } + if (iqsync_header_verify(iqsync) < 0) + return -1; return 0; } @@ -1386,22 +1671,22 @@ iqsync_send_hb_thread( iqsync_t * const iqsync = iqsync_ptr; const struct iqsync_data msg = { - .magic = htobe64(IQSYNC_DATA_MAGIC), - .src = 0, - .orig_src = 0, - .len = 0, + .magic = htobe64(IQSYNC_DATA_MAGIC), + .src = 0, + .orig_src = 0, + .len = 0, }; while (!iqsync->do_shutdown) { - sleep(1); + sleep(1); - ssize_t wlen = write(iqsync->write_fd, &msg, sizeof(msg)); - if (wlen == sizeof(msg)) - continue; + ssize_t wlen = write_all(iqsync->write_fd, &msg, sizeof(msg)); + if (wlen == sizeof(msg)) + continue; - TSLOG(TSERROR, "%s: Short write", iqsync->remote.name); - break; + TSLOGL(TSERROR, "%s: Short write", iqsync->remote.name); + break; } iqsync->do_shutdown = 1; @@ -1420,22 +1705,22 @@ iqsync_recv_hb_thread( while (!iqsync->do_shutdown) { - struct iqsync_data msg; - ssize_t rlen = read(iqsync->read_fd, &msg, sizeof(msg)); - if (rlen != sizeof(msg)) - { - TSLOG(TSERROR, "%s: 
Short read?", iqsync->remote.name); - break; - } - - if (msg.magic == htobe64(IQSYNC_DATA_MAGIC) - && msg.len == 0 - && msg.src == 0 - ) - continue; - - TSLOGX(TSWARN, "%s: Sent non-empty heartbeat?", iqsync->remote.name); - break; + struct iqsync_data msg; + ssize_t rlen = iqsync_read(iqsync, &msg, sizeof(msg)); + if (rlen != sizeof(msg)) + { + TSLOGL(TSERROR, "%s: Short read?", iqsync->remote.name); + break; + } + + if (msg.magic == htobe64(IQSYNC_DATA_MAGIC) + && msg.len == 0 + && msg.src == 0 + ) + continue; + + TSLOGXL(TSWARN, "%s: Sent non-empty heartbeat?", iqsync->remote.name); + break; } iqsync->do_shutdown = 1; @@ -1445,13 +1730,40 @@ iqsync_recv_hb_thread( } +/** Start the iqsync handshake process and spin off the send/receive + * threads that handle the exchange of data (depending on --push / --pull). + * If --report-interval is specified a stat reporting thread will also + * be created. + * + * \todo Take advantage of atomic creation code. + */ static void * iqsync_init_thread( void * const iqsync_ptr ) { + bool close_read_fd = true; + iqsync_t * const iqsync = iqsync_ptr; + if (iqsync->do_clone) { + if (iqsync_handshake_clone(iqsync) < 0) + goto init_fail; + } else { + if (iqsync_handshake_normal(iqsync) < 0) + goto init_fail; + } + + if (iqsync->do_prefetch + && iqueue_prefetch_thread(iqsync->iq, + &iqsync->prefetch_thread) != 0) + goto init_fail; + + if (iqsync->do_syncbehind + && iqueue_syncbehind_thread(iqsync->iq, + &iqsync->syncbehind_thread) != 0) + goto init_fail; + // Rescan to build our table of sources; we don't care about what // we find, so we ignore the result and don't look for anything in // particular. We only do this if we are pulling; push only mode @@ -1459,26 +1771,24 @@ iqsync_init_thread( // to start. This saves walking the entire file for a read-only mode if (iqsync->do_pull) { - // If the file has been iqsync'ed already, then the sources - // will be updated into the writer table and not much should need to - // be scanned. 
Bring things up to date with the end of the file - // just in case. - iqsync_sources_setup(iqsync); - iqsync_sources_scan_all(iqsync, 0, NULL); - - // Handshake and scan done, exchange start messages - - if (iqsync_start_send(iqsync) < 0) { - iqsync->do_shutdown = 1; - return NULL; - } + // If the file has been iqsync'ed already, then the sources + // will be updated into the writer table and not much should need to + // be scanned. Bring things up to date with the end of the file + // just in case. + iqsync_sources_setup(iqsync); + iqsync_sources_scan_all(iqsync, 0, NULL); + + // Handshake and scan done, exchange start messages + + if (iqsync_start_send(iqsync) < 0) { + goto init_fail; + } } // Handshake and scan done, exchange start messages if (iqsync->do_push && iqsync_start_recv(iqsync) < 0) { - iqsync->do_shutdown = 1; - return NULL; + goto init_fail; } // Start the clock @@ -1488,49 +1798,78 @@ iqsync_init_thread( // socket to detect a close. The pull thread doesn't do any // CPU pinning since it is spending all of its time in a read if (pthread_create( - &iqsync->pull_thread, - NULL, - iqsync->do_pull ? iqsync_pull_thread : iqsync_recv_hb_thread, - iqsync - ) < 0) - TSABORTX("Unable to create pull thread"); + &iqsync->pull_thread, + NULL, + iqsync->do_pull ? iqsync_pull_thread : iqsync_recv_hb_thread, + iqsync + ) < 0) { + TSLOGXL(TSERROR, "Unable to create pull thread"); + goto init_fail; + } + + // We have successfully created the pull thread, which now owns the read fd + close_read_fd = false; // Likewise, the report thread is sleeping most of the time so it does // not do any cpu pinning. 
+ pthread_mutex_init(&iqsync->stats_shutdown_mutex, NULL); + pthread_cond_init(&iqsync->stats_shutdown_cond, NULL); + if (iqsync->report_interval - && pthread_create(&iqsync->stat_thread, NULL, iqsync_stat_thread, iqsync) < 0) - TSABORTX("Unable to create stats thread"); + && pthread_create(&iqsync->stat_thread, NULL, iqsync_stat_thread, iqsync) < 0) { + TSLOGXL(TSERROR, "Unable to create stats thread"); + goto init_fail; + } // TODO: It would be better to do the work above on the pinned cpu, but the // spawned threads would inherit the affinity mask. This could be - // re-factored to workj better (hold the threads or saved the mask). - if (iqsync->local_cpu) + // re-factored to work better (hold the threads or save the mask). + if (iqsync->local_cpu && iqsync->do_push) { - char * end; - int cpu = strtoul(iqsync->local_cpu, &end, 0); - if (end == iqsync->local_cpu) - TSABORTX("Unable to parse local cpu '%s'", iqsync->local_cpu); - if (tssched_set_thread_affinity(pthread_self(), cpu) < 0) - TSABORT("Unable to set cpu affinity to %d", cpu); - TSLOGX(TSINFO, "Pinned push thread to cpu %d", cpu); + char * end; + int cpu = strtoul(iqsync->local_cpu, &end, 0); + if (end == iqsync->local_cpu) { + TSLOGXL(TSERROR, "Unable to parse local cpu '%s'", iqsync->local_cpu); + goto init_fail; + } + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + + if ((errno = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset))) { + TSLOGL(TSERROR, "Unable to set cpu affinity to %d", cpu); + goto init_fail; + } + TSLOGXL(TSINFO, "Pinned push thread to cpu %d", cpu); } + iqsync->initialization_rc = 1; + if (iqsync->do_push) - return iqsync_push_thread(iqsync); + return iqsync_push_thread(iqsync); else - return iqsync_send_hb_thread(iqsync); -} + return iqsync_send_hb_thread(iqsync); +init_fail: + iqsync->do_shutdown = 1; + iqsync->initialization_rc = -1; -/** Start the iqsync handshake process and spin off the send/receive - * threads that handle the exchange of data 
(depending on --push / --pull). - * If --report-interval is specified a stat reporting thread will also - * be created. - * - * \todo Take advantage of atomic creation code. + // We will close the read fd in this function only if we have not yet + // created the pull thread + if (close_read_fd) + close(iqsync->read_fd); + + // This thread becomes the write thread in the functions above, so any + // failure means we should close the write fd + close(iqsync->write_fd); + + return NULL; +} + +/** Start iqsync initialization thread */ int -iqsync_start( +iqsync_start_async( iqsync_t * const iqsync ) { @@ -1539,66 +1878,125 @@ iqsync_start( if (iqsync->do_clone && stat(iqsync->local.name, &statbuf) < 0) { - if (errno != ENOENT) - { - TSLOG(TSERROR, "%s: Unable to stat", iqsync->local.name); - return -1; - } - - if (iqsync_handshake_clone(iqsync) < 0) - return -1; + if (errno != ENOENT) + { + TSLOGL(TSERROR, "%s: Unable to stat", iqsync->local.name); + return -1; + } + } + + iqsync->initialization_rc = 0; + iqsync->wait_complete = 0; + + if (iqsync->use_recvbuffer && iqsync->do_pull) { + if (iqsync->recvbuffer_len == 0) + iqsync->recvbuffer_len = DEFAULT_RECVBUFFER_SIZE; + + assert(iqsync->recvbuffer_len == ceilintpow2(iqsync->recvbuffer_len)); + TSLOGX(TSINFO, "use recvbuffer of %"PRIu32" bytes", iqsync->recvbuffer_len); + + iqsync->recvbuffer_block_shift = ceilintlog2(iqsync->recvbuffer_len); + iqsync->recvbuffer_offset_mask = iqsync->recvbuffer_len - 1; + + iqsync->recvbuffer = calloc(iqsync->recvbuffer_len, sizeof(char)); + assert(iqsync->recvbuffer); } else { - if (iqsync_handshake_normal(iqsync) < 0) - return -1; + iqsync->recvbuffer = NULL; } - if (iqsync->do_prefetch - && iqueue_prefetch_thread(iqsync->iq, - &iqsync->prefetch_thread) != 0) - return -1; + if (iqsync->use_sendbuffer && iqsync->do_push) { + if (iqsync->sendbuffer_len == 0) + iqsync->sendbuffer_len = DEFAULT_SENDBUFFER_SIZE; - if (iqsync->do_syncbehind - && iqueue_syncbehind_thread(iqsync->iq, - 
&iqsync->syncbehind_thread) != 0) - return -1; + TSLOGX(TSINFO, "use sendbuffer of %"PRIu32" bytes", iqsync->sendbuffer_len); + assert(iqsync->sendbuffer_len == ceilintpow2(iqsync->sendbuffer_len)); + + iqsync->sendbuffer_block_shift = ceilintlog2(iqsync->sendbuffer_len); + iqsync->sendbuffer_offset_mask = iqsync->sendbuffer_len - 1; + + iqsync->sendbuffer = calloc(iqsync->sendbuffer_len, sizeof(char)); + assert(iqsync->sendbuffer); + } else { + iqsync->sendbuffer = NULL; + } // And kick off the threads to do the real work (the init thread // will become the push thread) if (pthread_create( - &iqsync->push_thread, - NULL, - iqsync_init_thread, - iqsync - ) < 0) - TSABORTX("Unable to create push thread"); + &iqsync->push_thread, + NULL, + iqsync_init_thread, + iqsync + ) < 0) { + TSLOGXL(TSERROR, "Unable to create push thread"); + + // We failed to create the init thread, so close fds here + close(iqsync->read_fd); + close(iqsync->write_fd); + + return -1; + } return 0; } +/** Wait for iqsync initialization to complete and return non-zero if it failed + */ +int +iqsync_start_async_wait( + iqsync_t * iqsync +) +{ + volatile int64_t *init_rc = &iqsync->initialization_rc; + + while (*init_rc == 0) + sched_yield(); + + return (*init_rc > 0) ? 
0 : -1; +} + +/** Start and initialize iqsync synchronously + */ +int +iqsync_start( + iqsync_t * const iqsync +) +{ + if (iqsync_start_async(iqsync) != 0) + return -1; + + return iqsync_start_async_wait(iqsync); +} + int iqsync_wait( iqsync_t * const iqsync ) { - // Wait for the thread to exit - pthread_join(iqsync->push_thread, NULL); - pthread_join(iqsync->pull_thread, NULL); + if (iqsync->wait_complete) + return 0; - if (iqsync->report_interval) - pthread_cancel(iqsync->stat_thread); + // Wait for the thread to exit + if (iqsync->push_thread != 0) + pthread_join(iqsync->push_thread, NULL); + if (iqsync->pull_thread != 0) + pthread_join(iqsync->pull_thread, NULL); + + if (iqsync->report_interval && iqsync->stat_thread != 0) { + pthread_cond_signal(&iqsync->stats_shutdown_cond); + pthread_join(iqsync->stat_thread, NULL); + } if (!iqsync->do_server) - iqsync_stats(iqsync); + iqsync_stats(iqsync); if (iqsync->verbose) - TSLOGX(TSINFO, "Exiting"); + TSLOGXL(TSINFO, "Exiting"); if (iqsync->close_iq_on_shutdown) - iqueue_close(iqsync->iq); - - close(iqsync->read_fd); - close(iqsync->write_fd); + iqueue_close(iqsync->iq); + iqsync->wait_complete = 1; return 0; } diff --git a/src/iqueue-main.c b/src/iqueue-main.c index d6f58fa..4549f24 100644 --- a/src/iqueue-main.c +++ b/src/iqueue-main.c @@ -1,14 +1,17 @@ -/* $TwoSigma: iqueue-main.c,v 1.17 2012/02/02 20:48:52 thudson Exp $ */ - /* - * Copyright (c) 2010 Two Sigma Investments, LLC - * All Rights Reserved + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF - * Two Sigma Investments, LLC. + * http://www.apache.org/licenses/LICENSE-2.0 * - * The copyright notice above does not evidence any - * actual or intended publication of such source code. 
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "twosigma.h" #include @@ -17,35 +20,33 @@ #include #include #include -#include "tsutil.h" -#include "tsio.h" -#include "tsutil_dcat.h" +#include +#include "tslog.h" #include "iqueue.h" -#include "iqueue_cpio.h" #include "tsclock.h" -#include "segfault.h" +#include "io_utils.h" static struct option long_options[] = { { "help", no_argument, 0, '?' }, { "iqueue", required_argument, 0, 'f' }, { "create", no_argument, 0, 'C' }, - { "header", no_argument, 0, 'H' }, - { "stats", no_argument, 0, 's' }, - { "watch", no_argument, 0, 'w' }, - { "append", no_argument, 0, 'a' }, + { "header", no_argument, 0, 'H' }, + { "stats", no_argument, 0, 's' }, + { "watch", no_argument, 0, 'w' }, + { "append", no_argument, 0, 'a' }, { "line", no_argument, 0, 'l' }, - { "follow", no_argument, 0, 'F' }, - { "seal", no_argument, 0, 'S' }, - { "archive", no_argument, 0, 'A' }, - { "zero", no_argument, 0, '0' }, + { "follow", no_argument, 0, 'F' }, + { "seal", no_argument, 0, 'S' }, + { "archive", no_argument, 0, 'A' }, + { "zero", no_argument, 0, '0' }, { "binary", no_argument, 0, 'b' }, - { "no-header", no_argument, 0, 'N' }, + { "no-header", no_argument, 0, 'N' }, { "copyin", no_argument, 0, '1' }, { "copyout", no_argument, 0, '2' }, { "writer", required_argument, 0, 'W' }, { "print-entry", required_argument, 0, 'n' }, - { "begin", required_argument, 0, 'B' }, - { "end", required_argument, 0, 'E' }, + { "begin", required_argument, 0, 'B' }, + { "end", required_argument, 0, 'E' }, { "debug", required_argument, 0, 'd' }, { 0, 0, 0, 0}, }; @@ -79,8 +80,8 @@ usage( " -A | --archive When sealing, archive the file\n" " -N | --no-header Do not print the user 
header\n" " -n | --print-entry N Print only entry number N\n" -" -B | --begin N Start from entry N\n" -" -E | --end N End with entry N\n" +" -B | --begin N Start from entry N\n" +" -E | --end N End with entry N\n" " -d | --debug N Debug entry N (or the entire queue if N==-1)\n" " -b | --binary Print binary messages\n" " -0 | -z | --zero Print nul-separated messages in ascii mode\n" @@ -102,6 +103,38 @@ usage( } +int +iqueue_reopen_wait( + iqueue_t * const iq +) +{ + const uint64_t creation = iqueue_creation(iq); + TSLOGXL(TSINFO, "%s: Waiting for new queue", iqueue_name(iq)); + + while (1) + { + // Try to re-open it until we have a file + if (iqueue_reopen(iq) < 0) + { + if (errno != ENOENT) + return -1; + + usleep(50000); + continue; + } + + // We have a real file; see if it is the same one + if (iqueue_creation(iq) == creation) + { + sleep(1); + continue; + } + + // New iqueue, new creation. We're done + return 0; + } +} + static ssize_t read_all( int fd, @@ -113,20 +146,20 @@ read_all( while (offset < len) { - ssize_t rc = read(fd, buf+offset, len-offset); - if (rc < 0) - { - TSLOG(TSERROR, "read failed"); - return -1; - } + ssize_t rc = read(fd, buf+offset, len-offset); + if (rc < 0) + { + TSLOGL(TSERROR, "read failed"); + return -1; + } - offset += rc; + offset += rc; - if (rc == 0) - return offset; + if (rc == 0) + return offset; } - TSLOGX(TSERROR, "message too long! limit is %zu bytes", len); + TSLOGXL(TSERROR, "message too long! limit is %zu bytes", len); return -1; } @@ -139,23 +172,23 @@ iqueue_append_one( ) { if (iqueue_is_sealed(iq)) - TSABORTX("can not append: iqueue is sealed"); + TSABORTX("can not append: iqueue is sealed"); uint8_t * const buf = calloc(1, IQUEUE_MSG_MAX); const size_t max_size = zero_out ? 
IQUEUE_MSG_MAX - 1 : IQUEUE_MSG_MAX; ssize_t len = read_all(fd, buf, max_size); if (len < 0) - TSABORTX("Error reading from stdin!"); + TSABORTX("Error reading from stdin!"); if (zero_out) - buf[len++] = '\0'; + buf[len++] = '\0'; int rc = iqueue_append(iq, buf, len); free(buf); if (rc != 0) { - TSLOGX(TSERROR, "%s: Update failed rc=%d", iqueue_name(iq), rc); - return EXIT_FAILURE; + TSLOGXL(TSERROR, "%s: Update failed rc=%d", iqueue_name(iq), rc); + return EXIT_FAILURE; } return EXIT_SUCCESS; @@ -180,24 +213,24 @@ read_entire_line( while (off < len-1) { - ssize_t rlen = read(fd, &buf[off], 1); - if (rlen < 0) - return rlen; - - if (rlen == 0) - { - // Closed file and no partial line - if (off == 0) - return -1; - - // Partial line read - off++; - break; - } - - // Check for end of line - if (separator == buf[off++]) - break; + ssize_t rlen = read(fd, &buf[off], 1); + if (rlen < 0) + return rlen; + + if (rlen == 0) + { + // Closed file and no partial line + if (off == 0) + return -1; + + // Partial line read + off++; + break; + } + + // Check for end of line + if (separator == buf[off++]) + break; } // nul terminate the string, just in case and return the length @@ -221,27 +254,27 @@ iqueue_append_octdump( for (size_t i = 0 ; i < len ; i++) { - char c = buf[i]; - if (c != '\\') - { - m[out_len++] = c; - continue; - } - - if (buf[i+1] == '\\') - { - m[out_len++] = '\\'; - i++; - continue; - } - - // Convert from an octal dump to hex - uint8_t o = 0; - o = (buf[++i] - '0') | (o << 3); - o = (buf[++i] - '0') | (o << 3); - o = (buf[++i] - '0') | (o << 3); - - m[out_len++] = o; + char c = buf[i]; + if (c != '\\') + { + m[out_len++] = c; + continue; + } + + if (buf[i+1] == '\\') + { + m[out_len++] = '\\'; + i++; + continue; + } + + // Convert from an octal dump to hex + uint8_t o = 0; + o = (buf[++i] - '0') | (o << 3); + o = (buf[++i] - '0') | (o << 3); + o = (buf[++i] - '0') | (o << 3); + + m[out_len++] = o; } // Resize the iqmsg, throwing away some space at the end @@ 
-250,16 +283,6 @@ iqueue_append_octdump( return iqueue_update(iq, iqmsg, NULL); } - -static int -iqueue_copyin_from_stdin( - const char * const filename, - const bool do_follow -) -{ - return iqueue_create_and_copyin_from_file(filename, do_follow, stdin); -} - static int iqueue_append_line( iqueue_t * const iq, @@ -270,47 +293,47 @@ iqueue_append_line( { if (iqueue_is_sealed(iq)) { - if (!do_follow) - TSABORTX("%s: can not append: iqueue is sealed", iqueue_name(iq)); - if (iqueue_reopen_wait(iq) < 0) - TSABORTX("%s: reopen failed", iqueue_name(iq)); + if (!do_follow) + TSABORTX("%s: can not append: iqueue is sealed", iqueue_name(iq)); + if (iqueue_reopen_wait(iq) < 0) + TSABORTX("%s: reopen failed", iqueue_name(iq)); } char * const buf = calloc(1, IQUEUE_MSG_MAX); while (1) { - int rc; - ssize_t len = read_entire_line(fd, buf, IQUEUE_MSG_MAX, zero_separator ? '\0' : '\n'); - if (len == -1) - break; - - // Do not include any newlines if we have them - if (zero_separator) - len--; - else - if (buf[len-1] == '\n') - buf[--len] = '\0'; + int rc; + ssize_t len = read_entire_line(fd, buf, IQUEUE_MSG_MAX, zero_separator ? 
'\0' : '\n'); + if (len == -1) + break; + + // Do not include any newlines if we have them + if (zero_separator) + len--; + else + if (buf[len-1] == '\n') + buf[--len] = '\0'; retry: - rc = iqueue_append_octdump(iq, buf, len); - if (rc == 0) - continue; - - if (rc != IQUEUE_STATUS_SEALED && !do_follow) - { - TSLOGX(TSERROR, "%s: Update failed rc=%d", iqueue_name(iq), rc); - return EXIT_FAILURE; - } - - // We have a sealed iqueue; try to reopen until we have a new one - if (iqueue_reopen_wait(iq) < 0) - { - TSLOG(TSERROR, "%s: Unable to reopen", iqueue_name(iq)); - return EXIT_FAILURE; - } - - goto retry; + rc = iqueue_append_octdump(iq, buf, len); + if (rc == 0) + continue; + + if (rc != IQUEUE_STATUS_SEALED && !do_follow) + { + TSLOGXL(TSERROR, "%s: Update failed rc=%d", iqueue_name(iq), rc); + return EXIT_FAILURE; + } + + // We have a sealed iqueue; try to reopen until we have a new one + if (iqueue_reopen_wait(iq) < 0) + { + TSLOGL(TSERROR, "%s: Unable to reopen", iqueue_name(iq)); + return EXIT_FAILURE; + } + + goto retry; } free(buf); @@ -325,40 +348,40 @@ iqueue_stats_output( ) { printf("%s:" - " %"PRIu64" (0x%"PRIx64")" - " data=%"PRIu64 - " entries=%"PRIu64 - "%s" - "\n", - iqueue_name(iq), - iqueue_creation(iq), - iqueue_creation(iq), - iqueue_data_len(iq), - iqueue_entries(iq), - iqueue_is_sealed(iq) ? " sealed" : "" + " %"PRIu64" (0x%"PRIx64")" + " data=%"PRIu64 + " entries=%"PRIu64 + "%s" + "\n", + iqueue_name(iq), + iqueue_creation(iq), + iqueue_creation(iq), + iqueue_data_len(iq), + iqueue_entries(iq), + iqueue_is_sealed(iq) ? 
" sealed" : "" ); for (unsigned id = 0 ; id < 4 ; id++) { - shash_t * const sh = iqueue_writer_table(iq, id, 0); - if (!sh) - continue; - - unsigned max_entries; - const shash_entry_t * const table = shash_entries(sh, &max_entries); - - printf("Writers %u:\n", id); - - for (unsigned i = 0 ; i < max_entries ; i++) - { - const shash_entry_t * const writer = &table[i]; - if (writer->key == 0) - break; - printf(" %"PRIx64": %"PRIu64"\n", - writer->key, - writer->value - ); - } + shash_t * const sh = iqueue_writer_table(iq, id, 0); + if (!sh) + continue; + + unsigned max_entries; + const shash_entry_t * const table = shash_entries(sh, &max_entries); + + printf("Writers %u:\n", id); + + for (unsigned i = 0 ; i < max_entries ; i++) + { + const shash_entry_t * const writer = &table[i]; + if (writer->key == 0) + break; + printf(" %"PRIx64": %"PRIu64"\n", + writer->key, + writer->value + ); + } } return 0; @@ -375,35 +398,35 @@ iqueue_watch_output( uint64_t old_entries = iqueue_entries(iq); printf("%s: creation %"PRIu64" (0x%"PRIx64"): %"PRIu64" entries\n", - iqueue_name(iq), - iqueue_creation(iq), - iqueue_creation(iq), - old_entries + iqueue_name(iq), + iqueue_creation(iq), + iqueue_creation(iq), + old_entries ); while (1) { - usleep(sleep_us); - const uint64_t new_time = tsclock_getnanos(0); - const uint64_t new_entries = iqueue_entries(iq); - if (new_entries == old_entries) - { - if (iqueue_is_sealed(iq)) - break; - continue; - } - - printf("%s: %"PRIu64" entries (%.0f entries/sec)\n", - iqueue_name(iq), - new_entries, - (new_entries - old_entries) * 1.0e9 / (new_time - old_time) - ); - - old_time = new_time; - old_entries = new_entries; + usleep(sleep_us); + const uint64_t new_time = tsclock_getnanos(0); + const uint64_t new_entries = iqueue_entries(iq); + if (new_entries == old_entries) + { + if (iqueue_is_sealed(iq)) + break; + continue; + } + + printf("%s: %"PRIu64" entries (%.0f entries/sec)\n", + iqueue_name(iq), + new_entries, + (new_entries - old_entries) * 1.0e9 
/ (new_time - old_time) + ); + + old_time = new_time; + old_entries = new_entries; } - TSLOGX(TSINFO, "%s: iqueue has been sealed", iqueue_name(iq)); + TSLOGXL(TSINFO, "%s: iqueue has been sealed", iqueue_name(iq)); return 0; } @@ -419,22 +442,22 @@ iqueue_seal_and_archive( int rc = iqueue_seal(iq); if (rc != 0) { - TSLOGX(TSERROR, "%s: Unable to seal: %s", - old_name, - rc == IQUEUE_STATUS_SEALED ? "already sealed" : - rc == IQUEUE_STATUS_INDEX_INVALID ? "invalid index" : - "unknown error" - ); - return EXIT_FAILURE; + TSLOGXL(TSERROR, "%s: Unable to seal: %s", + old_name, + rc == IQUEUE_STATUS_SEALED ? "already sealed" : + rc == IQUEUE_STATUS_INDEX_INVALID ? "invalid index" : + "unknown error" + ); + return EXIT_FAILURE; } if (!do_archive) - return EXIT_SUCCESS; + return EXIT_SUCCESS; if (iqueue_archive(iq, IQUEUE_MSG_BAD_ID) < 0) - return EXIT_FAILURE; + return EXIT_FAILURE; - TSLOGX(TSINFO, "%s: archived", iqueue_name(iq)); + TSLOGXL(TSINFO, "%s: archived", iqueue_name(iq)); return EXIT_SUCCESS; } @@ -449,59 +472,59 @@ iqueue_update_writer( char * end; const uint64_t id = strtoul(writer_flag, &end, 0); if (!end || end[0] != ',' || end[1] == '\0') - usage(stderr, "Unable to parse writer, must be 'N,V'\n"); + usage(stderr, "Unable to parse writer, must be 'N,V'\n"); const uint64_t value = strtoul(end+1, &end, 0); if (!end || end[0] != '\0') - usage(stderr, "Unable to parse value, must be 'N,V'\n"); + usage(stderr, "Unable to parse value, must be 'N,V'\n"); const unsigned table_id = 0; shash_t * const sh = iqueue_writer_table(iq, table_id, 1); if (!sh) - TSABORTX("%s: Unable to create/retrieve write table %u?", - iqueue_name(iq), - table_id - ); + TSABORTX("%s: Unable to create/retrieve write table %u?", + iqueue_name(iq), + table_id + ); shash_entry_t * writer = shash_insert(sh, id, value); if (writer) { - // Writer did not exist; we are done. 
- TSLOGX(TSINFO, "%s: Writer %u.0x%"PRIx64" value %"PRIu64, - iqueue_name(iq), - table_id, - id, - value - ); - return 0; + // Writer did not exist; we are done. + TSLOGXL(TSINFO, "%s: Writer %u.0x%"PRIx64" value %"PRIu64, + iqueue_name(iq), + table_id, + id, + value + ); + return 0; } // Writer already existed. Retrieve it and try an update writer = shash_get(sh, id); if (!writer) - TSABORTX("%s: Writer %u.0x%"PRIx64" should exist?", - iqueue_name(iq), - table_id, - id - ); + TSABORTX("%s: Writer %u.0x%"PRIx64" should exist?", + iqueue_name(iq), + table_id, + id + ); if (iqueue_writer_update(sh, writer, value)) { - TSLOGX(TSINFO, "%s: Writer %u.0x%"PRIx64" value %"PRIu64, - iqueue_name(iq), - table_id, - id, - value - ); - return 0; + TSLOGXL(TSINFO, "%s: Writer %u.0x%"PRIx64" value %"PRIu64, + iqueue_name(iq), + table_id, + id, + value + ); + return 0; } - TSLOGX(TSWARN, "%s: Writer %u.0x%"PRIx64" tried to write %"PRIu64", current value %"PRIu64, - iqueue_name(iq), - table_id, - id, - value, - writer->value + TSLOGXL(TSWARN, "%s: Writer %u.0x%"PRIx64" tried to write %"PRIu64", current value %"PRIu64, + iqueue_name(iq), + table_id, + id, + value, + writer->value ); return 0; @@ -514,8 +537,6 @@ main( char **argv ) { - segfault_handler_install(); - const char * iqueue_file = NULL; int create_flag = 0; int writable = 0; @@ -535,190 +556,181 @@ main( int do_stats = 0; int do_watch = 0; int read_header = 0; - int copyin_flag = 0; - int copyout_flag = 0; const char * writer_flag = NULL; while (1) { - int c = getopt_long( - argc, - argv, - "h?f:CGaFz0bAHI:D:n:B:E:swNl12W:", - long_options, - &option_index - ); - - if (c == -1) - break; - - switch (c) - { - case 0: break; - default: usage(stderr, ""); break; - case 'h': case '?': usage(stdout, ""); break; - - // Messagebox options - case 'f': iqueue_file = optarg; break; - case 'C': create_flag = 1; break; - case 'n': print_entry = strtoul(optarg, NULL, 0); break; - case 'd': debug_entry = strtoul(optarg, NULL, 0); 
break; - case 'B': begin_entry = strtoul(optarg, NULL, 0); break; - case 'E': end_entry = strtoul(optarg, NULL, 0); break; - case 'a': append = 1; writable = true; break; - case 'l': append_line = 1; writable = true; break; - case 'F': follow = 1; break; - case 'S': do_seal = 1; writable = true; break; - case 'A': do_archive = 1; break; - case 'z': - case '0': zero_out = 1; break; - case 'b': binary_out = 1; break; - case 'N': header = 0; break; - case 'H': read_header = 1; break; - case 's': do_stats = 1; break; - case 'w': do_watch = 1; break; - case '1': copyin_flag = 1; writable = true; break; - case '2': copyout_flag = 1; break; - case 'W': writer_flag = optarg; writable = true; break; - } + int c = getopt_long( + argc, + argv, + "h?f:CGaFz0bAHI:D:n:B:E:swNlW:", + long_options, + &option_index + ); + + if (c == -1) + break; + + switch (c) + { + case 0: break; + default: usage(stderr, ""); break; + case 'h': case '?': usage(stdout, ""); break; + + // Messagebox options + case 'f': iqueue_file = optarg; break; + case 'C': create_flag = 1; break; + case 'n': print_entry = strtoul(optarg, NULL, 0); break; + case 'd': debug_entry = strtoul(optarg, NULL, 0); break; + case 'B': begin_entry = strtoul(optarg, NULL, 0); break; + case 'E': end_entry = strtoul(optarg, NULL, 0); break; + case 'a': append = 1; writable = true; break; + case 'l': append_line = 1; writable = true; break; + case 'F': follow = 1; break; + case 'S': do_seal = 1; writable = true; break; + case 'A': do_archive = 1; break; + case 'z': + case '0': zero_out = 1; break; + case 'b': binary_out = 1; break; + case 'N': header = 0; break; + case 'H': read_header = 1; break; + case 's': do_stats = 1; break; + case 'w': do_watch = 1; break; + case 'W': writer_flag = optarg; writable = true; break; + } } if (!iqueue_file) - usage(stderr, "iqueue file must be specified!\n"); + usage(stderr, "iqueue file must be specified!\n"); if (argc != optind) - usage(stderr, "Extra arguments?\n"); + usage(stderr, "Extra 
arguments?\n"); uint8_t * user_hdr = NULL; size_t user_hdr_len = 0; if (read_header) { - if (create_flag != 1) - usage(stderr, "--read-header is not useful unless creating\n"); - user_hdr = alloca(65536); - user_hdr_len = read_all(STDIN_FILENO, user_hdr, 65536); - if (user_hdr_len == (size_t) -1) - TSABORTX("Error reading user header"); + if (create_flag != 1) + usage(stderr, "--read-header is not useful unless creating\n"); + user_hdr = alloca(65536); + user_hdr_len = read_all(STDIN_FILENO, user_hdr, 65536); + if (user_hdr_len == (size_t) -1) + TSABORTX("Error reading user header"); } - if (copyin_flag) - return iqueue_copyin_from_stdin(iqueue_file, follow); - if (create_flag) { - const uint64_t creation = tsclock_getnanos(0); - iqueue_t * const iq = iqueue_create( - iqueue_file, - creation, - user_hdr, - user_hdr_len - ); - if (!iq) - TSABORTX("%s: Unable to create", iqueue_file); - if (iqueue_creation(iq) != creation) - TSLOGX(TSINFO, "%s: iqueue already existed", iqueue_file); - return EXIT_SUCCESS; + const uint64_t creation = tsclock_getnanos(0); + iqueue_t * const iq = iqueue_create( + iqueue_file, + creation, + user_hdr, + user_hdr_len + ); + if (!iq) + TSABORTX("%s: Unable to create", iqueue_file); + if (iqueue_creation(iq) != creation) + TSLOGXL(TSINFO, "%s: iqueue already existed", iqueue_file); + return EXIT_SUCCESS; } iqueue_t * const iq = iqueue_open(iqueue_file, writable); if (!iq) - TSABORTX("Failed to %s %s", - create_flag == 1 ? "create" : "open", - iqueue_file - ); + TSABORTX("Failed to %s %s", + create_flag == 1 ? 
"create" : "open", + iqueue_file + ); if (writer_flag) - return iqueue_update_writer(iq, writer_flag); + return iqueue_update_writer(iq, writer_flag); if (do_stats) - return iqueue_stats_output(iq); + return iqueue_stats_output(iq); if (do_watch) - return iqueue_watch_output(iq, 1e6); - if (copyout_flag) - return iqueue_copyout(iq, follow, begin_entry, end_entry); + return iqueue_watch_output(iq, 1e6); if (append) - return iqueue_append_one(iq, STDIN_FILENO, zero_out); + return iqueue_append_one(iq, STDIN_FILENO, zero_out); if (append_line) - return iqueue_append_line(iq, STDIN_FILENO, zero_out, follow); + return iqueue_append_line(iq, STDIN_FILENO, zero_out, follow); if (do_seal) - return iqueue_seal_and_archive(iq, do_archive); + return iqueue_seal_and_archive(iq, do_archive); else if (do_archive) - return iqueue_archive(iq, IQUEUE_MSG_BAD_ID); + return iqueue_archive(iq, IQUEUE_MSG_BAD_ID); if (debug_entry != (uint64_t) -2) { - iqueue_debug(iq, debug_entry); - return EXIT_SUCCESS; + iqueue_debug(iq, debug_entry); + return EXIT_SUCCESS; } // Print the user header on the file, if there is one if (header) { - size_t hdr_len; - const uint8_t * const hdr_buf = iqueue_header(iq, &hdr_len); - size_t offset = 0; - while (offset < hdr_len) - { - ssize_t wlen = write( - STDOUT_FILENO, - hdr_buf + offset, - hdr_len - offset - ); - if (wlen <= 0) - TSABORT("header write failed"); - offset += wlen; - } + size_t hdr_len; + const uint8_t * const hdr_buf = iqueue_header(iq, &hdr_len); + size_t offset = 0; + while (offset < hdr_len) + { + ssize_t wlen = write( + STDOUT_FILENO, + hdr_buf + offset, + hdr_len - offset + ); + if (wlen <= 0) + TSABORT("header write failed"); + offset += wlen; + } } uint64_t id = begin_entry; if (print_entry != (uint64_t) -1) - id = print_entry; + id = print_entry; while (1) { - if (end_entry != (uint64_t) -1 && id > end_entry) - break; - size_t len; - const uint8_t * data = iqueue_data(iq, id, &len); - if (!data) - { - if (!follow) - break; - if 
(iqueue_is_sealed(iq)) - break; - usleep(100); - continue; - } - - id++; - - if (binary_out) - { - ssize_t wlen = tsio_write_all(STDOUT_FILENO, data, len); - if ((size_t) wlen != len) - TSABORT("write failed"); - } else { - // ASCII output - for (uint64_t i = 0 ; i < len ; i++) - { - uint8_t c = data[i]; - if (c == '\\') - printf("\\\\"); - else - if (isprint(c)) // || isspace(c)) - printf("%c", c); - else - printf("\\%03o", c); - } - - printf("%c", zero_out ? '\0' : '\n'); - } - - // If they have called us with print entry, we are done - if (print_entry != (uint64_t) -1) - break; + if (end_entry != (uint64_t) -1 && id > end_entry) + break; + size_t len; + const uint8_t * data = iqueue_data(iq, id, &len); + if (!data) + { + if (!follow) + break; + if (iqueue_is_sealed(iq)) + break; + usleep(100); + continue; + } + + id++; + + if (binary_out) + { + ssize_t wlen = write_all(STDOUT_FILENO, data, len); + if ((size_t) wlen != len) + TSABORT("write failed"); + } else { + // ASCII output + for (uint64_t i = 0 ; i < len ; i++) + { + uint8_t c = data[i]; + if (c == '\\') + printf("\\\\"); + else + if (isprint(c)) // || isspace(c)) + printf("%c", c); + else + printf("\\%03o", c); + } + + printf("%c", zero_out ? '\0' : '\n'); + } + + // If they have called us with print entry, we are done + if (print_entry != (uint64_t) -1) + break; } iqueue_close(iq); diff --git a/src/iqueue.c b/src/iqueue.c index a34ef0a..e4eeee9 100644 --- a/src/iqueue.c +++ b/src/iqueue.c @@ -1,23 +1,30 @@ -/* $TwoSigma: iqueue.c,v 1.30 2012/01/05 21:45:21 thudson Exp $ */ - /* - * Copyright (c) 2010 Two Sigma Investments, LLC - * All Rights Reserved + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF - * Two Sigma Investments, LLC. 
+ * http://www.apache.org/licenses/LICENSE-2.0 * - * The copyright notice above does not evidence any - * actual or intended publication of such source code. + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ #include "twosigma.h" +#include +#include +#include #include #include #include #include #include #include +#include #include #include #include @@ -28,29 +35,25 @@ #include #include #include -#include "tsutil.h" +#include "tslog.h" #include "tsclock.h" #include "iqueue.h" #include "iqsync.h" -#include "atomic.h" -#include "tslock.h" #include "shash.h" +#include -__RCSID("$TwoSigma: iqueue.c,v 1.30 2012/01/05 21:45:21 thudson Exp $"); - - -#define IQUEUE_INDEX_MAGIC ((uint32_t) 0xDADA1234) -#define IQUEUE_VERSION ((uint32_t) 0x00003000) -#define IQUEUE_BLOCK_MAGIC ((uint64_t) 0x6971626c6f636b00) -#define IQUEUE_MAX_HDR_LEN ((size_t) 4096) +#define IQUEUE_INDEX_MAGIC ((uint32_t) 0xDADA1234) +#define IQUEUE_VERSION ((uint32_t) 0x00003000) +#define IQUEUE_BLOCK_MAGIC ((uint64_t) 0x6971626c6f636b00) +#define IQUEUE_MAX_HDR_LEN ((size_t) 4096) -#define IQUEUE_TABLE_SHIFT 20 -#define IQUEUE_TABLE_MASK ((1 << IQUEUE_TABLE_SHIFT) - 1) -#define IQUEUE_TABLE_SIZE (1 << 20) +#define IQUEUE_TABLE_SHIFT 20 +#define IQUEUE_TABLE_MASK ((1 << IQUEUE_TABLE_SHIFT) - 1) +#define IQUEUE_TABLE_SIZE (1 << 20) -#define IQUEUE_WRITER_TABLES 4 +#define IQUEUE_WRITER_TABLES 4 -#define IQUEUE_BLOCK_COUNT (1024) +#define IQUEUE_BLOCK_COUNT (1024) typedef struct @@ -63,7 +66,7 @@ typedef struct // pointers to/size of the writer tables. Must be at least // 8-byte aligned to ensure that it does not cross a cache line. 
const iqueue_msg_t writer_tables[IQUEUE_WRITER_TABLES] - __attribute__((aligned(8))); + __attribute__((aligned(8))); volatile iqueue_id_t index_tail __attribute__((aligned(64))); volatile uint64_t data_tail __attribute__((aligned(64))); @@ -141,7 +144,7 @@ _iqueue_flock( ) { if (iq->last_grow_size == 0) - return 1; + return 1; iqueue_index_t * const idx = iq->idx; @@ -149,52 +152,54 @@ _iqueue_flock( // the process that has it locked, in which case we can then check // the flock time field. if (flock(iq->fd, LOCK_EX) < 0) - return -1; + return -1; // Check to see the file lock time uint64_t now = tsclock_getnanos(0); - uint64_t old_flock_time = atomic_cas_64(&idx->flock_time, 0, now); + uint64_t const zero = 0; // If there was no file lock time set, and we wrote our time to it, // then we have the locks and are ready to proceed. - if (old_flock_time == 0) - return now; + if (atomic_compare_exchange_strong(&idx->flock_time, &zero, now)) + return now; - TSLOGX(TSINFO, "%s: lock held since %"PRIu64" (now=%"PRIu64")", - iq->name, - old_flock_time, - now + uint64_t old_flock_time = idx->flock_time; + + TSLOGXL(TSINFO, "%s: lock held since %"PRIu64" (now=%"PRIu64")", + iq->name, + old_flock_time, + now ); // Spin for up to 1 msec or until the flock_time changes. const uint64_t flock_timeout = 1000000; while (now < old_flock_time + flock_timeout) { - // If it changes, that means that another thread in our process - // has finished its business and the lock conditions should be - // rechecked to see if it even matters any more. - if (idx->flock_time != old_flock_time) - { - flock(iq->fd, LOCK_UN); - TSLOGX(TSDEBUG, "%s: Lock is available again", iq->name); - return 0; - } - - usleep(10); - now = tsclock_getnanos(0); + // If it changes, that means that another thread in our process + // has finished its business and the lock conditions should be + // rechecked to see if it even matters any more. 
+ if (idx->flock_time != old_flock_time) + { + flock(iq->fd, LOCK_UN); + TSLOGXL(TSDEBUG, "%s: Lock is available again", iq->name); + return 0; + } + + usleep(10); + now = tsclock_getnanos(0); } // Someone else has held the lock for more than our timeout, // which means they are likely dead. Steal the lock from // them if we can. - if (atomic_cas_bool_64(&idx->flock_time, old_flock_time, now)) + if (atomic_compare_exchange_strong(&idx->flock_time, &old_flock_time, now)) { - TSLOGX(TSWARN, "%s: Stole lock after timeout", iq->name); - return now; + TSLOGXL(TSWARN, "%s: Stole lock after timeout", iq->name); + return now; } flock(iq->fd, LOCK_UN); - TSLOGX(TSDEBUG, "%s: Lock is available again", iq->name); + TSLOGXL(TSDEBUG, "%s: Lock is available again", iq->name); return 0; } @@ -206,10 +211,10 @@ _iqueue_funlock( ) { if (iq->last_grow_size == 0) - return; + return; - if (!atomic_cas_bool_64(&iq->idx->flock_time, lock_time, 0)) - TSLOGX(TSWARN, "%s: Lock was stolen from us! Danger!", iq->name); + if (!atomic_compare_exchange_strong(&iq->idx->flock_time, &lock_time, 0)) + TSLOGXL(TSWARN, "%s: Lock was stolen from us! Danger!", iq->name); flock(iq->fd, LOCK_UN); } @@ -233,57 +238,57 @@ iqueue_grow_file( struct stat sb; retry: if (fstat(iq->fd, &sb) < 0) - goto fail; + goto fail; // Once we (or someone) have been successful in growing the file, // we're done and can return. if (sb.st_size >= (off_t) new_size) { - TSLOGX(TSDEBUG, "%s: File is already %"PRIu64" bytes >= %"PRIu64, - iq->name, - (uint64_t) sb.st_size, - new_size - ); - return 0; + TSLOGXL(TSDEBUG, "%s: File is already %"PRIu64" bytes >= %"PRIu64, + iq->name, + (uint64_t) sb.st_size, + new_size + ); + return 0; } // For the first block, we can't lock since the file doesn't exist. 
const uint64_t flock_time = _iqueue_flock(iq); if (flock_time == (uint64_t) -1) - goto fail; + goto fail; if (flock_time == 0) - goto retry; + goto retry; // Double check the file size, just in case // in between us checking the size and then getting the lock, // someone else has grown the file. if (fstat(iq->fd, &sb) < 0) { - _iqueue_funlock(iq, flock_time); - goto fail; + _iqueue_funlock(iq, flock_time); + goto fail; } if (sb.st_size >= (off_t) new_size) { - _iqueue_funlock(iq, flock_time); - TSLOGX(TSDEBUG, "%s: Someone else grew the file to %"PRIu64, - iq->name, - new_size - ); - return 0; + _iqueue_funlock(iq, flock_time); + TSLOGXL(TSDEBUG, "%s: Someone else grew the file to %"PRIu64, + iq->name, + new_size + ); + return 0; } - TSLOGX(TSINFO, "%s: Growing from 0x%"PRIx64" to 0x%"PRIx64" bytes", - iq->name, - (uint64_t) sb.st_size, - new_size + TSLOGXL(TSDEBUG, "%s: Growing from 0x%"PRIx64" to 0x%"PRIx64" bytes", + iq->name, + (uint64_t) sb.st_size, + new_size ); if (ftruncate(iq->fd, new_size) < 0) { - _iqueue_funlock(iq, flock_time); - goto fail; + _iqueue_funlock(iq, flock_time); + goto fail; } _iqueue_funlock(iq, flock_time); @@ -291,9 +296,9 @@ iqueue_grow_file( return 0; fail: - TSLOG(TSERROR, "%s: Failed to grow to %"PRIu64" bytes", - iq->name, - new_size + TSLOGL(TSERROR, "%s: Failed to grow to %"PRIu64" bytes", + iq->name, + new_size ); return -1; @@ -309,22 +314,22 @@ iqueue_mlock_block( { if (block_id >= IQUEUE_BLOCK_COUNT) { - TSLOGX(TSWARN, "%s: Block %"PRIu64" out of range", iq->name, block_id); - return -1; + TSLOGXL(TSWARN, "%s: Block %"PRIu64" out of range", iq->name, block_id); + return -1; } void * const block = iq->blocks[block_id]; if (!block) - return 0; + return 0; if (mlock(block, IQUEUE_BLOCK_SIZE) == 0) - return 0; + return 0; - TSLOG(TSWARN, "%s: Unable to mlock(block[%"PRIu64"]=%p,0x%"PRIx64")", - iq->name, - block_id, - block, - IQUEUE_BLOCK_SIZE + TSLOGL(TSWARN, "%s: Unable to mlock(block[%"PRIu64"]=%p,0x%"PRIx64")", + iq->name, + 
block_id, + block, + IQUEUE_BLOCK_SIZE ); iq->mlock_flag = 0; @@ -343,43 +348,43 @@ iqueue_fsync_block( { if (block_id >= IQUEUE_BLOCK_COUNT) { - TSLOGX(TSWARN, "%s: Block %"PRIu64" out of range", iq->name, block_id); - return -1; + TSLOGXL(TSWARN, "%s: Block %"PRIu64" out of range", iq->name, block_id); + return -1; } void * const block = iq->blocks[block_id]; if (!block) - return -1; + return -1; const uint64_t block_offset = block_id << IQUEUE_BLOCK_SHIFT; // First sync contents of block to ensure any dirty pages in our mapping // are saved back to the file if (sync_file_range( - iq->fd, - block_offset, - IQUEUE_BLOCK_SIZE, - SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE) != 0) { - TSLOG(TSWARN, "%s: Unable to fsync(block[%"PRIu64"]=%p,0x%"PRIx64")", - iq->name, - block_id, - block, - IQUEUE_BLOCK_SIZE - ); - return -1; + iq->fd, + block_offset, + IQUEUE_BLOCK_SIZE, + SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE) != 0) { + TSLOGL(TSWARN, "%s: Unable to fsync(block[%"PRIu64"]=%p,0x%"PRIx64")", + iq->name, + block_id, + block, + IQUEUE_BLOCK_SIZE + ); + return -1; } // Now free the VMAs so Linux will reclaim teh pages (unless we access // it again) if (madvise(block, IQUEUE_BLOCK_SIZE, MADV_DONTNEED) != 0) { - TSLOG(TSWARN, "%s: Unable to madvise(block[%"PRIu64"]=%p,0x%"PRIx64 - ", MADV_DONTNEED)", - iq->name, - block_id, - block, - IQUEUE_BLOCK_SIZE - ); - return -1; + TSLOGL(TSWARN, "%s: Unable to madvise(block[%"PRIu64"]=%p,0x%"PRIx64 + ", MADV_DONTNEED)", + iq->name, + block_id, + block, + IQUEUE_BLOCK_SIZE + ); + return -1; } return 0; @@ -401,19 +406,19 @@ _iqueue_map_block( { // If we do not have a file, we can not do any mappings if (iq->fd < 0) - return NULL; + return NULL; // Make sure the file is at least as large as this block size const uint64_t min_size = (block_id + 2) << IQUEUE_BLOCK_SHIFT; if (iqueue_grow_file(iq, min_size) < 0) - return NULL; + return NULL; // Quick check to see if any one else has already done so uint8_t * block = 
iq->blocks[block_id]; if (block) { - TSLOGX(TSDEBUG, "%s: block %"PRIu64" already mapped to %p", iq->name, block_id, block); - return block; + TSLOGXL(TSDEBUG, "%s: block %"PRIu64" already mapped to %p", iq->name, block_id, block); + return block; } // Attempt to map it @@ -421,46 +426,47 @@ _iqueue_map_block( uint64_t map_time = -tsclock_getnanos(0); block = mmap( - NULL, - IQUEUE_BLOCK_SIZE, - iq->mmap_prot, - iq->mmap_flags, - iq->fd, - block_offset + NULL, + IQUEUE_BLOCK_SIZE, + iq->mmap_prot, + iq->mmap_flags, + iq->fd, + block_offset ); if (block == MAP_FAILED) { - TSLOGX(TSERROR, "%s: Failed to map offset %"PRIu64, iq->name, block_offset); - return NULL; + TSLOGXL(TSERROR, "%s: Failed to map offset %"PRIu64, iq->name, block_offset); + return NULL; } map_time += tsclock_getnanos(0); // Attempt to write the mapping into the local block table - if (!atomic_cas_bool_ptr((volatile void*) &iq->blocks[block_id], NULL, block)) + uint8_t *const null = NULL; + if (!atomic_compare_exchange_strong(&iq->blocks[block_id], &null, block)) { - // We lost! Someone else beat us to it. Deallocate our block - // and use theirs instead. Sucks to be us. - TSLOGX(TSDEBUG, - "%s: Lost race. Unmapping %"PRIx64" from %p", - iq->name, - block_offset, - block - ); - - munmap(block, IQUEUE_BLOCK_SIZE); - return iq->blocks[block_id]; + // We lost! Someone else beat us to it. Deallocate our block + // and use theirs instead. Sucks to be us. + TSLOGXL(TSDEBUG, + "%s: Lost race. 
Unmapping %"PRIx64" from %p", + iq->name, + block_offset, + block + ); + + munmap(block, IQUEUE_BLOCK_SIZE); + return iq->blocks[block_id]; } - TSLOGX(TSINFO, "%s: Mapped 0x%"PRIx64" to %p in %"PRIu64" ns", - iq->name, - block_offset, - block, - map_time + TSLOGXL(TSDEBUG, "%s: Mapped 0x%"PRIx64" to %p in %"PRIu64" ns", + iq->name, + block_offset, + block, + map_time ); if (iq->mlock_flag) - iqueue_mlock_block(iq, block_id); + iqueue_mlock_block(iq, block_id); posix_madvise(block, IQUEUE_BLOCK_SIZE, POSIX_MADV_SEQUENTIAL); @@ -484,43 +490,43 @@ _iqueue_unmap( { if (iq->prefetch_thread) { - TSLOGX(TSINFO, "%s: Shutdown prefetch thread", iq->name); - pthread_cancel(iq->prefetch_thread); - pthread_join(iq->prefetch_thread, NULL); - iq->prefetch_thread = 0; + TSLOGXL(TSINFO, "%s: Shutdown prefetch thread", iq->name); + pthread_cancel(iq->prefetch_thread); + pthread_join(iq->prefetch_thread, NULL); + iq->prefetch_thread = 0; } if (iq->syncbehind_thread) { - TSLOGX(TSINFO, "%s: Shutdown syncbehind thread", iq->name); - pthread_cancel(iq->syncbehind_thread); - pthread_join(iq->syncbehind_thread, NULL); - iq->syncbehind_thread = 0; + TSLOGXL(TSINFO, "%s: Shutdown syncbehind thread", iq->name); + pthread_cancel(iq->syncbehind_thread); + pthread_join(iq->syncbehind_thread, NULL); + iq->syncbehind_thread = 0; } if (iq->fd >= 0) { - close(iq->fd); - iq->fd = -1; + close(iq->fd); + iq->fd = -1; } // Don't unmap idx if it was shared with iq->blocks[0] if ((void*) iq->idx != iq->blocks[0] && iq->idx) { - TSLOGX(TSDEBUG, "%s: Unmapping idx %p", iq->name, iq->idx); - munmap(iq->idx, IQUEUE_BLOCK_SIZE); + TSLOGXL(TSDEBUG, "%s: Unmapping idx %p", iq->name, iq->idx); + munmap(iq->idx, IQUEUE_BLOCK_SIZE); } iq->idx = NULL; for (unsigned i = 0 ; i < IQUEUE_BLOCK_COUNT ; i++) { - if (!iq->blocks[i]) - continue; + if (!iq->blocks[i]) + continue; - TSLOGX(TSDEBUG, "%s: Unmapping block %u: %p", iq->name, i, iq->blocks[i]); - munmap(iq->blocks[i], IQUEUE_BLOCK_SIZE); - iq->blocks[i] = NULL; 
+ TSLOGXL(TSDEBUG, "%s: Unmapping block %u: %p", iq->name, i, iq->blocks[i]); + munmap(iq->blocks[i], IQUEUE_BLOCK_SIZE); + iq->blocks[i] = NULL; } // Trash the cached table lookup so that we won't use it @@ -540,31 +546,30 @@ _iqueue_reopen( int serrno; if (iq->fd >= 0) - _iqueue_unmap(iq); + _iqueue_unmap(iq); const int open_flags = iq->open_flags | (create_flag ? O_CREAT : 0); const char * const open_str = - open_flags == O_RDONLY ? "readonly" : - open_flags == O_RDWR ? "readwrite" : - "create"; + open_flags == O_RDONLY ? "readonly" : + open_flags == O_RDWR ? "readwrite" : + "create"; iq->fd = open( - iq->name, - open_flags, - iq->open_mode + iq->name, + open_flags, + iq->open_mode ); if (iq->fd < 0) { - if (errno == ENOENT) - { - TSLOGX(TSWARN, "%s: No such file or directory", iq->name); - sleep(1); // force a short wait - errno = ENOENT; - return -1; - } - - TSLOGX(TSERROR, "%s: Unable to open %s", iq->name, open_str); - goto fail; + if (errno == ENOENT) + { + TSLOGXL(TSWARN, "%s: No such file or directory", iq->name); + errno = ENOENT; + return -1; + } + + TSLOGXL(TSERROR, "%s: Unable to open %s", iq->name, open_str); + goto fail; } // If we are creating the iqueue for the first time we are allowed @@ -572,26 +577,26 @@ _iqueue_reopen( // temporary file. if (create_flag) { - iq->idx = _iqueue_map_block(iq, 0); - if (iq->idx == NULL) - goto fail; - return 0; + iq->idx = _iqueue_map_block(iq, 0); + if (iq->idx == NULL) + goto fail; + return 0; } // We can't use _iqueue_map_block() since that might modify an existing // file. 
Instead we have to just map a minimal segment at first void * block = mmap( - NULL, - IQUEUE_BLOCK_SIZE, - iq->mmap_prot, - iq->mmap_flags, - iq->fd, - 0 + NULL, + IQUEUE_BLOCK_SIZE, + iq->mmap_prot, + iq->mmap_flags, + iq->fd, + 0 ); if (block == MAP_FAILED) { - TSLOG(TSERROR, "%s: Failed to map index header", iq->name); - goto fail; + TSLOGL(TSERROR, "%s: Failed to map index header", iq->name); + goto fail; } iq->idx = block; @@ -600,16 +605,16 @@ _iqueue_reopen( struct stat sb; if (fstat(iq->fd, &sb) < 0) { - TSLOG(TSERROR, "%s: Failed to stat", iq->name); - goto fail; + TSLOGL(TSERROR, "%s: Failed to stat", iq->name); + goto fail; } const uint64_t file_size = sb.st_size; if ((size_t) file_size < sizeof(*iq->idx)) { - TSLOGX(TSWARN, "%s: File is much too small. Not an iqx?", iq->name); - goto fail; + TSLOGXL(TSWARN, "%s: File is much too small. Not an iqx?", iq->name); + goto fail; } @@ -617,27 +622,27 @@ _iqueue_reopen( if (iq->idx->magic != IQUEUE_INDEX_MAGIC || iq->idx->version != IQUEUE_VERSION) { - TSLOGX(TSERROR, - "%s: Magic %"PRIx32".%"PRIx32" != expected %"PRIx32".%"PRIx32, - iq->name, - iq->idx->magic, - iq->idx->version, - IQUEUE_INDEX_MAGIC, - IQUEUE_VERSION - ); - goto fail; + TSLOGXL(TSERROR, + "%s: Magic %"PRIx32".%"PRIx32" != expected %"PRIx32".%"PRIx32, + iq->name, + iq->idx->magic, + iq->idx->version, + IQUEUE_INDEX_MAGIC, + IQUEUE_VERSION + ); + goto fail; } // Everything looks ok so far. - TSLOGX(TSDEBUG, - "%s: %s: creation %"PRIu64", entries %"PRIu64", data %"PRIu64", size %"PRIu64" %s", - iq->name, - open_str, - iq->idx->creation_time, - iq->idx->index_tail, - iq->idx->data_tail, - file_size, - iqueue_is_sealed(iq) ? ", sealed" : "" + TSLOGXL(TSDEBUG, + "%s: %s: creation %"PRIu64", entries %"PRIu64", data %"PRIu64", size %"PRIu64" %s", + iq->name, + open_str, + iq->idx->creation_time, + iq->idx->index_tail, + iq->idx->data_tail, + file_size, + iqueue_is_sealed(iq) ? 
", sealed" : "" ); iq->last_grow_size = file_size; @@ -669,7 +674,7 @@ _iqueue_init( { iqueue_t * const iq = calloc(1, sizeof(*iq)); if (!iq) - goto fail_iqueue_malloc; + goto fail_iqueue_malloc; int open_flags = O_RDONLY; int open_mode = 0444; @@ -678,25 +683,25 @@ _iqueue_init( if (create_flag >= 0) { - open_flags = O_RDWR; - open_mode |= 0222; - mmap_prot |= PROT_WRITE; + open_flags = O_RDWR; + open_mode |= 0222; + mmap_prot |= PROT_WRITE; } memcpy(iq, &(iqueue_t) { - .fd = -1, - .table_cache = -1, - .name = filename, - .open_flags = open_flags, - .open_mode = open_mode, - .mmap_prot = mmap_prot, - .mmap_flags = mmap_flags, + .fd = -1, + .table_cache = -1, + .name = filename, + .open_flags = open_flags, + .open_mode = open_mode, + .mmap_prot = mmap_prot, + .mmap_flags = mmap_flags, }, sizeof(*iq)); sem_init(&iq->prefetch_sem, 0, 0); if (_iqueue_reopen(iq, create_flag == 1) < 0) - goto fail_reopen; + goto fail_reopen; return iq; @@ -716,19 +721,36 @@ iqueue_open( { char * const filename = strdup(index_filename); if (!filename) - return NULL; + return NULL; iqueue_t * const iq = _iqueue_init(filename, writeable ? 0 : -1); if (!iq) - return NULL; + return NULL; if (writeable) - iqueue_prefetch(iq, 0, 16 << 20); + iqueue_prefetch(iq, 0, 16 << 20); return iq; } +int +iqueue_update_creation_time( + iqueue_t * iq, + uint64_t creation_time +) +{ + if (iq->open_flags != O_RDWR) { + TSLOGX(TSERROR, "iqueue is not writable"); + return -1; + } + uint64_t *creation_time_ptr = __UNCONST_T(uint64_t *, &iq->idx->creation_time); + *creation_time_ptr = creation_time; + + return 0; +} + + iqueue_t * iqueue_create( const char * index_filename, @@ -738,69 +760,117 @@ iqueue_create( ) { if (creation == 0) - creation = tsclock_getnanos(0); + creation = tsclock_getnanos(0); - if (hdr_len > IQUEUE_MAX_HDR_LEN) + // If the argument is a symlink, resolve it and work on the destination. 
+ static_assert(PATH_MAX > 0, "PATH_MAX is not defined as a positive value, need to replace PATH_MAX"); + if (strlen(index_filename) > PATH_MAX) { - TSLOGX(TSERROR, "%s: Header len %zu > max %zu", - index_filename, - hdr_len, - IQUEUE_MAX_HDR_LEN - ); - + TSLOGX(TSERROR, "%s: Iqueue path longer than PATH_MAX(%d)", index_filename, PATH_MAX); return NULL; } - const int namelen = strlen(index_filename); + char index_resolved[PATH_MAX+1]; + char index_buffer[PATH_MAX+1]; + strcpy(index_resolved, index_filename); + + ssize_t resolved_len = 0; + size_t symlink_hops = 0; + while (1) + { + resolved_len = readlink(index_resolved, index_buffer, PATH_MAX); + int readlink_errno = errno; + if (resolved_len == -1 && (readlink_errno == EINVAL || readlink_errno == ENOENT)) + { + // Found a path that either is not a symlink or doesn't exist yet. + break; + } + else if (resolved_len == -1) + { + TSLOGX(TSERROR, "%s: Could not resolve real path (broken at: %s, reason: %s)", + index_filename, index_resolved, strerror(readlink_errno)); + return NULL; + } + else if (resolved_len == PATH_MAX) + { + TSLOGX(TSERROR, "%s: Could not resolve real path (path too long: %s, max %d)", + index_filename, index_resolved, PATH_MAX); + return NULL; + } + memcpy(index_resolved, index_buffer, resolved_len); + index_resolved[resolved_len] = '\0'; + + ++symlink_hops; + if (symlink_hops > _SC_SYMLOOP_MAX) { + TSLOGX(TSERROR, "%s: Could not resolve real path (in a loop: stoppped at %s, symlink hops: %zu)", + index_filename, index_resolved, symlink_hops); + return NULL; + } + } + TSLOGX(TSINFO, "%s: Will use resolved path: %s", index_filename, index_resolved); + + if (hdr_len > IQUEUE_MAX_HDR_LEN) + { + TSLOGXL(TSERROR, "%s (%s): Header len %zu > max %zu", + index_filename, + index_resolved, + hdr_len, + IQUEUE_MAX_HDR_LEN + ); + + return NULL; + } + + const int namelen = strlen(index_resolved); char * filename = calloc(1, namelen + 32); if (!filename) - goto fail_filename_alloc; - snprintf(filename, 
namelen+32, "%s.%"PRIx64, index_filename, creation); + goto fail_filename_alloc; + snprintf(filename, namelen+32, "%s.%"PRIx64, index_resolved, creation); iqueue_t * const iq = _iqueue_init(filename, 1); if (!iq) - goto fail_iq_alloc; + goto fail_iq_alloc; // Fill in the required fields and user header memcpy(iq->idx, &(iqueue_index_t) { - .magic = IQUEUE_INDEX_MAGIC, - .version = IQUEUE_VERSION, - .creation_time = creation, - .hdr_len = hdr_len, - .index_tail = 0, - .data_tail = sizeof(*iq->idx) - + IQUEUE_TABLE_SIZE * sizeof(*iq->idx->tables), + .magic = IQUEUE_INDEX_MAGIC, + .version = IQUEUE_VERSION, + .creation_time = creation, + .hdr_len = hdr_len, + .index_tail = 0, + .data_tail = sizeof(*iq->idx) + + IQUEUE_TABLE_SIZE * sizeof(*iq->idx->tables), }, sizeof(*iq->idx)); memcpy(iq->idx->hdr, hdr, hdr_len); // The file is fully built on disk. Attempt to atomically swap it for // the real one. - if (link(filename, index_filename) == -1) + if (link(filename, index_resolved) == -1) { - if (errno != EEXIST) - { - TSLOG(TSERROR, "%s: Unable to link from %s", index_filename, filename); - goto fail_link; - } - - // Remove our temp file, and trailing creation time - unlink(filename); - filename[namelen] = '\0'; - - // Clean up and try to open it as a normal - // iqueue. The caller will know that they lost the race since - // the creation time will not be the same as the one they specified - // \note: Do not goto the failure path since we do not want to unlink - // the actual iqueue file. - TSLOGX(TSINFO, "%s: Lost creation race. Retrying", index_filename); - if (_iqueue_reopen(iq, 0) < 0) - { - iqueue_close(iq); - return NULL; - } - - return iq; + if (errno != EEXIST) + { + TSLOGL(TSERROR, "%s (%s): Unable to link from %s", index_filename, index_resolved, filename); + goto fail_link; + } + + // Remove our temp file, and trailing creation time + unlink(filename); + filename[namelen] = '\0'; + + // Clean up and try to open it as a normal + // iqueue. 
The caller will know that they lost the race since + // the creation time will not be the same as the one they specified + // \note: Do not goto the failure path since we do not want to unlink + // the actual iqueue file. + TSLOGXL(TSINFO, "%s (%s): Lost creation race. Retrying", index_filename, index_resolved); + if (_iqueue_reopen(iq, 0) < 0) + { + iqueue_close(iq); + return NULL; + } + + return iq; } // We won the race. Unlink our temp file, update our name and keep going @@ -855,16 +925,16 @@ iqueue_archive( // Attempt to seal the iqueue at this id if one is provided if (seal_id != IQUEUE_MSG_BAD_ID) { - int rc = iqueue_try_seal(iq, seal_id); - if (rc != 0) - { - TSLOGX(TSDEBUG, "%s: Failed to seal at id %"PRIu64": rc=%d", - iq->name, - seal_id, - rc - ); - return rc; - } + int rc = iqueue_try_seal(iq, seal_id); + if (rc != 0) + { + TSLOGXL(TSDEBUG, "%s: Failed to seal at id %"PRIu64": rc=%d", + iq->name, + seal_id, + rc + ); + return rc; + } } // We have successfully sealed the iqueue (or are not doing so) @@ -872,34 +942,34 @@ iqueue_archive( const size_t namelen = strlen(old_name) + 32; char * new_name = calloc(1, namelen); if (!new_name) - return -1; + return -1; snprintf(new_name, namelen, - "%s.%"PRIu64, - old_name, - iqueue_creation(iq) + "%s.%"PRIu64, + old_name, + iqueue_creation(iq) ); if (!iqueue_is_sealed(iq)) - TSLOGX(TSWARN, "%s: Archiving an unsealed iqueue", old_name); + TSLOGXL(TSWARN, "%s: Archiving an unsealed iqueue", old_name); if (link(old_name, new_name) == -1) { - TSLOG(TSERROR, "%s: Unable to create link to archive %s", - old_name, - new_name - ); - return -1; + TSLOGL(TSERROR, "%s: Unable to create link to archive %s", + old_name, + new_name + ); + return -1; } if (unlink(old_name) == -1) { - TSLOG(TSERROR, "%s: Unable to unlink", old_name); - unlink(new_name); - return -1; + TSLOGL(TSERROR, "%s: Unable to unlink", old_name); + unlink(new_name); + return -1; } - TSLOGX(TSDEBUG, "%s: Archived to %s", iq->name, new_name); + 
TSLOGXL(TSDEBUG, "%s: Archived to %s", iq->name, new_name); return 0; } @@ -915,25 +985,25 @@ iqueue_entries( while (1) { - switch (iqueue_status(iq, tail)) - { - case IQUEUE_STATUS_HAS_DATA: - // There are still entries out there. Try the next one - tail++; - continue; - - case IQUEUE_STATUS_NO_DATA: - // We have found the end. - return tail; - - case IQUEUE_STATUS_SEALED: - // We are one past the actual end of data - return tail - 1; - - default: - // Something is very wrong - return (uint64_t) -1; - } + switch (iqueue_status(iq, tail)) + { + case IQUEUE_STATUS_HAS_DATA: + // There are still entries out there. Try the next one + tail++; + continue; + + case IQUEUE_STATUS_NO_DATA: + // We have found the end. + return tail; + + case IQUEUE_STATUS_SEALED: + // We are one past the actual end of data + return tail - 1; + + default: + // Something is very wrong + return (uint64_t) -1; + } } } @@ -986,8 +1056,8 @@ iqueue_mlock( for (unsigned block_id = 0 ; block_id < IQUEUE_BLOCK_COUNT ; block_id++) { - if (iqueue_mlock_block(iq, block_id) == -1) - return -1; + if (iqueue_mlock_block(iq, block_id) == -1) + return -1; } return 0; @@ -1010,18 +1080,18 @@ iqueue_get_data( offset &= IQUEUE_BLOCK_MASK; if (unlikely(block_id >= IQUEUE_BLOCK_COUNT)) - return NULL; + return NULL; uint8_t * block = iq->blocks[block_id]; if (unlikely(block == NULL)) { - if (!do_map) - return NULL; + if (!do_map) + return NULL; - // They want it mapped. - block = _iqueue_map_block(iq, block_id); - if (!block) - return NULL; + // They want it mapped. 
+ block = _iqueue_map_block(iq, block_id); + if (!block) + return NULL; } return block + offset; @@ -1036,13 +1106,13 @@ iqueue_allocate_raw( ) { if (unlikely(!offset_out || len >= IQUEUE_BLOCK_SIZE)) - return NULL; + return NULL; if (unlikely((iq->mmap_prot & PROT_WRITE) == 0)) { - if (!iq->warned_readonly_allocate) - TSLOGX(TSWARN, "%s: Attempt to allocate from read-only iqueue", iq->name); - iq->warned_readonly_allocate = 1; - return NULL; + if (!iq->warned_readonly_allocate) + TSLOGXL(TSWARN, "%s: Attempt to allocate from read-only iqueue", iq->name); + iq->warned_readonly_allocate = 1; + return NULL; } iqueue_index_t * const idx = iq->idx; @@ -1050,22 +1120,22 @@ iqueue_allocate_raw( while (1) { - uint64_t tail = offset = idx->data_tail; - uint64_t new_tail = tail + len; - - // Check to see if this would cross a 1 GB boundary and adjust - // the allocation upwards to avoid the boundary. We also - // must avoid the first 64 bytes of the new block to avoid - // the markers. - if ((tail >> IQUEUE_BLOCK_SHIFT) != (new_tail >> IQUEUE_BLOCK_SHIFT)) - { - offset = ((tail >> IQUEUE_BLOCK_SHIFT) + 1) << IQUEUE_BLOCK_SHIFT; - offset += sizeof(iqueue_block_t); - new_tail = offset + len; - } - - if (atomic_cas_bool_64(&idx->data_tail, tail, new_tail)) - break; + uint64_t tail = offset = idx->data_tail; + uint64_t new_tail = tail + len; + + // Check to see if this would cross a 1 GB boundary and adjust + // the allocation upwards to avoid the boundary. We also + // must avoid the first 64 bytes of the new block to avoid + // the markers. 
+ if ((tail >> IQUEUE_BLOCK_SHIFT) != (new_tail >> IQUEUE_BLOCK_SHIFT)) + { + offset = ((tail >> IQUEUE_BLOCK_SHIFT) + 1) << IQUEUE_BLOCK_SHIFT; + offset += sizeof(iqueue_block_t); + new_tail = offset + len; + } + + if (atomic_compare_exchange_strong(&idx->data_tail, &tail, new_tail)) + break; } // We have updated the idx->data_tail and can start to fill @@ -1073,7 +1143,7 @@ iqueue_allocate_raw( // until iqueue_update() is called on the data pointer. void * const data = (void*)(uintptr_t) iqueue_get_data(iq, offset, 1); if (!data) - return NULL; + return NULL; *offset_out = iqueue_msg(offset, len); return data; @@ -1098,16 +1168,17 @@ iqueue_cas( { // If the value is still 0, write our msg offset into it // If we fail, return immediately. - if (unlikely(!atomic_cas_bool_64( - (uint64_t*)(uintptr_t)&slot->v, - 0, - new_msg.v + uint64_t const zero = 0; + if (unlikely(!atomic_compare_exchange_strong( + (uint64_t*)(uintptr_t)&slot->v, + &zero, + new_msg.v ))) - return 0; + return 0; // Do not advance the tail for seal messages, so reader can see them if (unlikely(new_msg.v == IQUEUE_MSG_SEALED)) - return 1; + return 1; // We have written our offset into slot id, try to advance // the tail to one past where we are, but do not care if @@ -1115,9 +1186,9 @@ iqueue_cas( // the idx for us. 
const uint64_t current_tail = idx->index_tail; if (current_tail >= id + 1) - return 1; + return 1; - atomic_cas_bool_64(&idx->index_tail, current_tail, id + 1); + atomic_compare_exchange_strong(&idx->index_tail, &current_tail, id + 1); return 1; } @@ -1134,12 +1205,12 @@ iqueue_new_table( iqueue_index_t * const idx = iq->idx; iqueue_msg_t msg; const void * const table_buf = iqueue_allocate_raw( - iq, - IQUEUE_TABLE_SIZE * sizeof(*idx->tables) + 32, - &msg + iq, + IQUEUE_TABLE_SIZE * sizeof(*idx->tables) + 32, + &msg ); if (!table_buf) - return 0; + return 0; uint64_t offset = iqueue_msg_offset(msg); @@ -1147,14 +1218,15 @@ iqueue_new_table( offset = (offset + 31) & ~31; // Attempt to store the new table into the correct slot - if (atomic_cas_bool_64(&idx->tables[table_num], 0, offset)) + uint64_t const zero = 0; + if (atomic_compare_exchange_strong(&idx->tables[table_num], &zero, offset)) { - TSLOGX(TSDIAG, "%s: New tables[%"PRIu64"] = %"PRIx64, - iq->name, - table_num, - offset - ); - return offset; + TSLOGXL(TSDIAG, "%s: New tables[%"PRIu64"] = %"PRIx64, + iq->name, + table_num, + offset + ); + return offset; } // We lost the race, but now have a huge allocation. Try @@ -1163,8 +1235,8 @@ iqueue_new_table( while (++table_num < IQUEUE_TABLE_SIZE) { - if (atomic_cas_bool_64(&idx->tables[table_num], 0, offset)) - return correct_offset; + if (atomic_compare_exchange_strong(&idx->tables[table_num], &zero, offset)) + return correct_offset; } // We've fully populated the tables? This really shouldn't happen, @@ -1191,33 +1263,33 @@ iqueue_get_slot( if (likely((last_table & IQUEUE_TABLE_MASK) == table_num)) { - // Hurrah! We hit our cached value - table_offset = last_table >> IQUEUE_TABLE_SHIFT; + // Hurrah!
We hit our cached value + table_offset = last_table >> IQUEUE_TABLE_SHIFT; } else { - // Not the cached value; do a full lookup - if (unlikely(table_num > IQUEUE_TABLE_SIZE)) - return NULL; - - table_offset = iq->idx->tables[table_num]; - if (unlikely(!table_offset)) - { - // There is no table for this id yet, create it if requested - if (!create) - return NULL; - - table_offset = iqueue_new_table(iq, table_num); - if (!table_offset) - return NULL; - } - - // We have a pointer to the table; cache the value and return the table - iq->table_cache = (table_offset << IQUEUE_TABLE_SHIFT) | table_num; + // Not the cached value; do a full lookup + if (unlikely(table_num > IQUEUE_TABLE_SIZE)) + return NULL; + + table_offset = iq->idx->tables[table_num]; + if (unlikely(!table_offset)) + { + // There is no table for this id yet, create it if requested + if (!create) + return NULL; + + table_offset = iqueue_new_table(iq, table_num); + if (!table_offset) + return NULL; + } + + // We have a pointer to the table; cache the value and return the table + iq->table_cache = (table_offset << IQUEUE_TABLE_SHIFT) | table_num; } // We have a table offset now; find the actual block that goes with it iqueue_msg_t * const table = (void*)(uintptr_t) iqueue_get_data(iq, table_offset, 1); if (!table) - return NULL; + return NULL; return &table[offset]; } @@ -1235,33 +1307,33 @@ iqueue_try_update_internal( if (unlikely(tail != id)) { - // If the id they are trying to write to is less than the - // current tail, it is guaranteed to fail since there is already - // something written there. - if (id < tail) - return IQUEUE_STATUS_HAS_DATA; - - // If the id they are trying to write to is not at the tail - // position, then it would leave a hole in the index. This is - // not allowed, so the index might be invalid. To confirm, - // check to see if the actual entries value is wrong. 
- if (id > iqueue_entries(iq)) - return IQUEUE_STATUS_INDEX_INVALID; - - // It was a spurious case of the tail being wrong; allow the - // iqueue_try_update() to proceed. + // If the id they are trying to write to is less than the + // current tail, it is guaranteed to fail since there is already + // something written there. + if (id < tail) + return IQUEUE_STATUS_HAS_DATA; + + // If the id they are trying to write to is not at the tail + // position, then it would leave a hole in the index. This is + // not allowed, so the index might be invalid. To confirm, + // check to see if the actual entries value is wrong. + if (id > iqueue_entries(iq)) + return IQUEUE_STATUS_INDEX_INVALID; + + // It was a spurious case of the tail being wrong; allow the + // iqueue_try_update() to proceed. } iqueue_msg_t * const slot = iqueue_get_slot(iq, id, 1); if (unlikely(!slot)) - return IQUEUE_STATUS_INDEX_INVALID; + return IQUEUE_STATUS_INDEX_INVALID; if (likely(iqueue_cas(idx, slot, id, new_msg))) - return 0; + return 0; // We lost. Check for the possibility that the iqueue has been sealed if (unlikely(slot->v == IQUEUE_MSG_SEALED)) - return IQUEUE_STATUS_SEALED; + return IQUEUE_STATUS_SEALED; return IQUEUE_STATUS_HAS_DATA; } @@ -1280,29 +1352,29 @@ iqueue_update_internal( // Find the next available slot while (1) { - const iqueue_id_t id = idx->index_tail; - iqueue_msg_t * const slot = iqueue_get_slot(iq, id, 1); - if (unlikely(!slot)) - return IQUEUE_STATUS_INDEX_INVALID; - - if (unlikely(slot->v == IQUEUE_MSG_SEALED)) - return IQUEUE_STATUS_SEALED; - - if (unlikely(slot->v)) - { - // The list is in an inconsistent state; try to advance - // the tail pointer. - atomic_cas_bool_64(&idx->index_tail, id, id+1); - continue; - } - - // Write to user slot before attempting the CAS to preserve - // all lockless guarantees. - if (id_out != NULL) - *id_out = (is_id_be ? 
htobe64(id) : id); - - if (likely(iqueue_cas(idx, slot, id, new_msg))) - return 0; + const iqueue_id_t id = idx->index_tail; + iqueue_msg_t * const slot = iqueue_get_slot(iq, id, 1); + if (unlikely(!slot)) + return IQUEUE_STATUS_INDEX_INVALID; + + if (unlikely(slot->v == IQUEUE_MSG_SEALED)) + return IQUEUE_STATUS_SEALED; + + if (unlikely(slot->v)) + { + // The list is in an inconsistent state; try to advance + // the tail pointer. + atomic_compare_exchange_strong(&idx->index_tail, &id, id+1); + continue; + } + + // Write to user slot before attempting the CAS to preserve + // all lockless guarantees. + if (id_out != NULL) + *id_out = (is_id_be ? htobe64(id) : id); + + if (likely(iqueue_cas(idx, slot, id, new_msg))) + return 0; } } @@ -1357,7 +1429,7 @@ iqueue_seal( ) { const iqueue_msg_t new_msg = { - .v = IQUEUE_MSG_SEALED + .v = IQUEUE_MSG_SEALED }; return iqueue_update_internal(iq, new_msg, NULL, 0); @@ -1371,7 +1443,7 @@ iqueue_try_seal( ) { const iqueue_msg_t new_msg = { - .v = IQUEUE_MSG_SEALED + .v = IQUEUE_MSG_SEALED }; return iqueue_try_update_internal(iq, id, new_msg); @@ -1404,13 +1476,13 @@ iqueue_status( ) { iqueue_msg_t * const slot - = iqueue_get_slot(__UNCONST_T(iqueue_t*, iq), id, 0); + = iqueue_get_slot(__UNCONST_T(iqueue_t*, iq), id, 0); if (!slot || !slot->v) - return IQUEUE_STATUS_NO_DATA; + return IQUEUE_STATUS_NO_DATA; if (slot->v == IQUEUE_MSG_SEALED) - return IQUEUE_STATUS_SEALED; + return IQUEUE_STATUS_SEALED; return IQUEUE_STATUS_HAS_DATA; } @@ -1429,51 +1501,51 @@ iqueue_status_wait( while (1) { - if (!msg) - { - // Try to get the slot, but do not modify the tables. 
- // If we are blocking forever, keep trying - msg = iqueue_get_slot(iq, id, 0); - if (!msg && timeout_ns >= 0 && timeout_ns < 10) - return IQUEUE_STATUS_NO_DATA; - } - - if (msg) - { - // We have the slot; try - // Try a few times before checking the clock - for (int i = 0 ; i < 1000 ; i++) - { - if (unlikely(!msg->v)) { - if (timeout_ns == 0) - return IQUEUE_STATUS_NO_DATA; - - _mm_pause(); - continue; - } - - // Check if the iqueue is sealed at this index - if (unlikely(msg->v == IQUEUE_MSG_SEALED)) - return IQUEUE_STATUS_SEALED; - - // Build a user-space pointer from the pointer - return IQUEUE_STATUS_HAS_DATA; - } - } - - // timeout == -1 means loop forever - if (timeout_ns == -1) - continue; - - // timeout < 10 means just check the queue a few times - if (timeout_ns < 10) - break; - - if (!start_time) - start_time = tsclock_getnanos(0); - - if (start_time + timeout_ns < tsclock_getnanos(0)) - break; + if (!msg) + { + // Try to get the slot, but do not modify the tables. + // If we are blocking forever, keep trying + msg = iqueue_get_slot(iq, id, 0); + if (!msg && timeout_ns >= 0 && timeout_ns < 10) + return IQUEUE_STATUS_NO_DATA; + } + + if (msg) + { + // We have the slot; try + // Try a few times before checking the clock + for (int i = 0 ; i < 1000 ; i++) + { + if (unlikely(!msg->v)) { + if (timeout_ns == 0) + return IQUEUE_STATUS_NO_DATA; + + _mm_pause(); + continue; + } + + // Check if the iqueue is sealed at this index + if (unlikely(msg->v == IQUEUE_MSG_SEALED)) + return IQUEUE_STATUS_SEALED; + + // Build a user-space pointer from the pointer + return IQUEUE_STATUS_HAS_DATA; + } + } + + // timeout == -1 means loop forever + if (timeout_ns == -1) + continue; + + // timeout < 10 means just check the queue a few times + if (timeout_ns < 10) + break; + + if (!start_time) + start_time = tsclock_getnanos(0); + + if (start_time + timeout_ns < tsclock_getnanos(0)) + break; } return IQUEUE_STATUS_NO_DATA; @@ -1491,20 +1563,20 @@ iqueue_offset( // Retrieve the 
map-space pointer iqueue_msg_t * const msg_ptr = iqueue_get_slot(iq, id, 0); if (unlikely(!msg_ptr)) - return -1; + return -1; iqueue_msg_t msg = *msg_ptr; if (unlikely(!msg.v)) - return -1; + return -1; // Check if the iqueue is sealed at this index if (unlikely(msg.v == IQUEUE_MSG_SEALED)) - return -1; + return -1; if (likely(size_out)) - *size_out = iqueue_msg_len(msg); + *size_out = iqueue_msg_len(msg); - TSLOGX(TSDIAG, "%s: %"PRIx64": %p = %"PRIx64, iq->name, id, msg_ptr, msg.v); + TSLOGXL(TSDIAG, "%s: %"PRIx64": %p = %"PRIx64, iq->name, id, msg_ptr, msg.v); return iqueue_msg_offset(msg); } @@ -1518,11 +1590,11 @@ iqueue_is_sealed( while (1) { - int status = iqueue_status(iqueue, id++); - if (status == IQUEUE_STATUS_HAS_DATA) - continue; + int status = iqueue_status(iqueue, id++); + if (status == IQUEUE_STATUS_HAS_DATA) + continue; - return status == IQUEUE_STATUS_SEALED; + return status == IQUEUE_STATUS_SEALED; } } @@ -1536,13 +1608,13 @@ iqueue_allocator_init( ) { memcpy(allocator, &(iqueue_allocator_t) { - .iq = iq, - .bulk_len = bulk_len, - .auto_refill = auto_refill, + .iq = iq, + .bulk_len = bulk_len, + .auto_refill = auto_refill, }, sizeof(*allocator)); if (iqueue_allocator_refill(allocator) < 0) - return -1; + return -1; return 0; } @@ -1556,21 +1628,21 @@ iqueue_allocator_refill( { iqueue_msg_t msg; allocator->base = iqueue_allocate_raw( - allocator->iq, - allocator->bulk_len, - &msg + allocator->iq, + allocator->bulk_len, + &msg ); if (!allocator->base) - return -1; + return -1; allocator->base_offset = iqueue_msg_offset(msg); allocator->offset = 0; - TSLOGX(TSDEBUG, "%s: Refill base=%p offset=%"PRIx64" len=%"PRIx64, - allocator->iq->name, - allocator->base, - allocator->base_offset, - allocator->bulk_len + TSLOGXL(TSDEBUG, "%s: Refill base=%p offset=%"PRIx64" len=%"PRIx64, + allocator->iq->name, + allocator->base, + allocator->base_offset, + allocator->bulk_len ); return 0; @@ -1588,10 +1660,10 @@ iqueue_realloc( const uint64_t msg_len = 
iqueue_msg_len(*msg); return iqueue_realloc_bulk( - allocator, - msg, - msg_len, - new_len + allocator, + msg, + msg_len, + new_len ); } @@ -1606,7 +1678,7 @@ iqueue_realloc_bulk( { const uint64_t msg_offset = iqueue_msg_offset(*msg); if (new_len > IQUEUE_MSG_MAX || new_len > msg_len) - return -1; + return -1; // Where was the offset after this message was allocated const uint64_t cur_offset = msg_offset + msg_len - allocator->base_offset; @@ -1618,9 +1690,9 @@ iqueue_realloc_bulk( // then further allocations have been done and we can't // resize this one. // \todo: Can we do this with atomics to save on locking? - //return atomic_cas_bool_64(&allocator->offset, cur_offset, new_offset); + //return atomic_compare_exchange_strong(&allocator->offset, &cur_offset, new_offset); if (allocator->offset != cur_offset) - return 0; + return 0; allocator->offset = new_offset; *msg = iqueue_msg(msg_offset, new_len); return 1; @@ -1637,17 +1709,18 @@ iqueue_prefetch( { for (uint64_t offset = 0 ; offset < extent ; offset += 4096) { - volatile uint64_t * data = (void*)(uintptr_t) iqueue_get_data(iq, base + offset, 1); - if (!data) - { - TSLOGX(TSERROR, "%s: Unable to get data at offset %"PRIx64, iq->name, base + offset); - return -1; - } - - if (iq->mmap_prot & PROT_WRITE) - atomic_cas_bool_64(data, 0, 0); - else - data[0]; + volatile uint64_t * data = (void*)(uintptr_t) iqueue_get_data(iq, base + offset, 1); + if (!data) + { + TSLOGXL(TSERROR, "%s: Unable to get data at offset %"PRIx64, iq->name, base + offset); + return -1; + } + + uint64_t const zero = 0; + if (iq->mmap_prot & PROT_WRITE) + atomic_compare_exchange_strong(data, &zero, 0); + else + data[0]; } return 0; @@ -1671,27 +1744,27 @@ prefetch_thread( while (1) { - if (prefetch_delay) - usleep(prefetch_delay); - - if (iq->idx->data_tail + prefetch_size / 2 < offset) - continue; - - // They have used up more than half our last block. 
- // Start prefetching the next block - uint64_t prefetch_time = -tsclock_getnanos(0); - if (iqueue_prefetch(iq, offset, prefetch_size) < 0) - break; - - prefetch_time += tsclock_getnanos(0); - TSLOGX(TSDEBUG, "%s: Prefetched %"PRIx64" to %"PRIx64" in %"PRIu64" ns", - iq->name, - offset, - offset + prefetch_size, - prefetch_time - ); - - offset += prefetch_size; + if (prefetch_delay) + usleep(prefetch_delay); + + if (iq->idx->data_tail + prefetch_size / 2 < offset) + continue; + + // They have used up more than half our last block. + // Start prefetching the next block + uint64_t prefetch_time = -tsclock_getnanos(0); + if (iqueue_prefetch(iq, offset, prefetch_size) < 0) + break; + + prefetch_time += tsclock_getnanos(0); + TSLOGXL(TSDEBUG, "%s: Prefetched %"PRIx64" to %"PRIx64" in %"PRIu64" ns", + iq->name, + offset, + offset + prefetch_size, + prefetch_time + ); + + offset += prefetch_size; } return NULL; @@ -1707,12 +1780,12 @@ iqueue_prefetch_thread( if (!iq->prefetch_thread && pthread_create(&iq->prefetch_thread, NULL, prefetch_thread, iq) < 0) { - TSLOG(TSERROR, "%s: Unable to create prefetch thread", iq->name); - return -1; + TSLOGL(TSERROR, "%s: Unable to create prefetch thread", iq->name); + return -1; } if (thread_out) - *thread_out = iq->prefetch_thread; + *thread_out = iq->prefetch_thread; return 0; } @@ -1732,27 +1805,27 @@ syncbehind_thread( while (1) { - if (syncbehind_delay) - usleep(syncbehind_delay); - - while (mapped_to_block_id < IQUEUE_BLOCK_COUNT && - iq->blocks[mapped_to_block_id] != NULL) - mapped_to_block_id++; - - // They have used up more than half our last block. 
- // Start prefetching the next block - for (; mapped_to_block_id - synced_to_block_id > active_block_count; - synced_to_block_id++) { - uint64_t syncbehind_time = -tsclock_getnanos(0); - iqueue_fsync_block(iq, synced_to_block_id); - - syncbehind_time += tsclock_getnanos(0); - TSLOGX(TSINFO, "%s: Synced block %"PRIu64" in %"PRIu64" ns", - iq->name, - synced_to_block_id, - syncbehind_time - ); - } + if (syncbehind_delay) + usleep(syncbehind_delay); + + while (mapped_to_block_id < IQUEUE_BLOCK_COUNT && + iq->blocks[mapped_to_block_id] != NULL) + mapped_to_block_id++; + + // They have used up more than half our last block. + // Start prefetching the next block + for (; mapped_to_block_id - synced_to_block_id > active_block_count; + synced_to_block_id++) { + uint64_t syncbehind_time = -tsclock_getnanos(0); + iqueue_fsync_block(iq, synced_to_block_id); + + syncbehind_time += tsclock_getnanos(0); + TSLOGXL(TSINFO, "%s: Synced block %"PRIu64" in %"PRIu64" ns", + iq->name, + synced_to_block_id, + syncbehind_time + ); + } } return NULL; @@ -1768,12 +1841,12 @@ iqueue_syncbehind_thread( if (!iq->syncbehind_thread && pthread_create(&iq->syncbehind_thread, NULL, syncbehind_thread, iq) < 0) { - TSLOG(TSERROR, "%s: Unable to create syncbehind thread", iq->name); - return -1; + TSLOGL(TSERROR, "%s: Unable to create syncbehind thread", iq->name); + return -1; } if (thread_out) - *thread_out = iq->syncbehind_thread; + *thread_out = iq->syncbehind_thread; return 0; } @@ -1788,48 +1861,48 @@ iqueue_table_debug( for (uint64_t i = 0 ; i < IQUEUE_TABLE_SIZE ; i++) { - const uint64_t offset = iq->idx->tables[i]; - if (!offset) - continue; - - TSLOGX(TSINFO, "%s: table[0x%"PRIx64"] offset 0x%"PRIx64"%s", - iq->name, - i, - offset, - (offset & 0x7) ? 
" UNALIGNED" : "" - ); - - const uint64_t * const table = iqueue_get_data(iq, offset, 1); - if (!table) - TSABORTX("%s: Unable to get table %"PRIu64"?", iq->name, i); - - for (uint64_t j = 0 ; j <= IQUEUE_TABLE_MASK ; j++) - { - iqueue_msg_t msg = { .v = table[j] }; - const uint64_t off = iqueue_msg_offset(msg); - const uint64_t len = iqueue_msg_len(msg); - if (!off && !len) - { - skipped++; - continue; - } - - if (skipped) - TSLOGX(TSERROR, "%s: Missing indices in table 0x%"PRIx64"!", iq->name, i); - - const struct iqsync_data * const iqsync = iqsync_data_msg(iq, off); - - printf("%"PRIx64",%"PRIx64",%"PRIx64":%"PRIu64",%"PRIx64":%"PRIu64",%"PRIx64",%"PRIu64"\n", - i, - j, - iqsync ? be64toh(iqsync->orig_src) : 0, - iqsync ? be64toh(iqsync->orig_index) : 0, - iqsync ? be64toh(iqsync->src) : 0, - iqsync ? be64toh(iqsync->iq_index) : 0, - off, - len - ); - } + const uint64_t offset = iq->idx->tables[i]; + if (!offset) + continue; + + TSLOGXL(TSINFO, "%s: table[0x%"PRIx64"] offset 0x%"PRIx64"%s", + iq->name, + i, + offset, + (offset & 0x7) ? " UNALIGNED" : "" + ); + + const uint64_t * const table = iqueue_get_data(iq, offset, 1); + if (!table) + TSABORTX("%s: Unable to get table %"PRIu64"?", iq->name, i); + + for (uint64_t j = 0 ; j <= IQUEUE_TABLE_MASK ; j++) + { + iqueue_msg_t msg = { .v = table[j] }; + const uint64_t off = iqueue_msg_offset(msg); + const uint64_t len = iqueue_msg_len(msg); + if (!off && !len) + { + skipped++; + continue; + } + + if (skipped) + TSLOGXL(TSERROR, "%s: Missing indices in table 0x%"PRIx64"!", iq->name, i); + + const struct iqsync_data * const iqsync = iqsync_data_msg(iq, off); + + printf("%"PRIx64",%"PRIx64",%"PRIx64":%"PRIu64",%"PRIx64":%"PRIu64",%"PRIx64",%"PRIu64"\n", + i, + j, + iqsync ? be64toh(iqsync->orig_src) : 0, + iqsync ? be64toh(iqsync->orig_index) : 0, + iqsync ? be64toh(iqsync->src) : 0, + iqsync ? 
be64toh(iqsync->iq_index) : 0, + off, + len + ); + } } } @@ -1842,63 +1915,63 @@ iqueue_debug( { if (id == (uint64_t) -1) { - iqueue_table_debug(iq); - return; + iqueue_table_debug(iq); + return; } size_t len; uint64_t offset = iqueue_offset(iq, id, &len); if (offset == (uint64_t) -1) { - TSLOGX(TSINFO, "%s: %"PRIu64": No slot allocated", - iq->name, - id - ); - return; + TSLOGXL(TSINFO, "%s: %"PRIu64": No slot allocated", + iq->name, + id + ); + return; } const volatile iqueue_msg_t * const slot = iqueue_get_slot(iq, id, 0); - TSLOGX(TSINFO, "%s: %"PRIu64": offset=%"PRId64" len=%zu slot=%p%s", - iq->name, - id, - offset, - len, - slot, - ((uintptr_t) slot & 7) ? " UNALIGNED" : "" + TSLOGXL(TSINFO, "%s: %"PRIu64": offset=%"PRId64" len=%zu slot=%p%s", + iq->name, + id, + offset, + len, + slot, + ((uintptr_t) slot & 7) ? " UNALIGNED" : "" ); const struct iqsync_data * const msg = iqsync_data_msg(iq, offset); if (msg) { - TSLOGX(TSINFO, "%s: %"PRIu64": sending src=%"PRIu64":%"PRIu64" len=%u", - iq->name, - id, - be64toh(msg->src), - be64toh(msg->iq_index), - be32toh(msg->len) - ); - - TSLOGX(TSINFO, "%s: %"PRIu64": orig src=%"PRIu64":%"PRIu64, - iq->name, - id, - be64toh(msg->orig_src), - be64toh(msg->orig_index) - ); + TSLOGXL(TSINFO, "%s: %"PRIu64": sending src=%"PRIu64":%"PRIu64" len=%u", + iq->name, + id, + be64toh(msg->src), + be64toh(msg->iq_index), + be32toh(msg->len) + ); + + TSLOGXL(TSINFO, "%s: %"PRIu64": orig src=%"PRIu64":%"PRIu64, + iq->name, + id, + be64toh(msg->orig_src), + be64toh(msg->orig_index) + ); } const void * const data = iqueue_get_data(iq, offset, 1); if (!data) { - TSLOGX(TSERROR, - "%s: %"PRIu64": Unable to retrieve data at offset %"PRIu64"?", - iq->name, - id, - offset - ); - return; + TSLOGXL(TSERROR, + "%s: %"PRIu64": Unable to retrieve data at offset %"PRIu64"?", + iq->name, + id, + offset + ); + return; } - TSHDUMP(TSINFO, data, len); + TSHDUMPL(TSINFO, data, len); } @@ -1913,58 +1986,59 @@ _iqueue_writer_table( ) { if (table_id >= 
IQUEUE_WRITER_TABLES) - return NULL; + return NULL; if (iq->writer_tables[table_id]) - return iq->writer_tables[table_id]; + return iq->writer_tables[table_id]; iqueue_index_t * const idx = iq->idx; iqueue_msg_t table_msg = idx->writer_tables[table_id]; if (table_msg.v == 0) { - if (!create) - return NULL; - - const size_t table_len = IQUEUE_WRITER_MAX * sizeof(shash_entry_t); - const size_t table_max_len = table_len + IQUEUE_WRITER_MASK; - - void * const table_buf = iqueue_allocate_raw( - iq, - table_max_len, - &table_msg - ); - if (!table_buf) - { - TSLOGX(TSERROR, "%s: Unable to allocate table space %zu bytes", - iqueue_name(iq), - table_max_len - ); - return NULL; - } - - // Force alignment of the table since it will have 16-byte - // CAS operations done on it. - uint64_t offset = iqueue_msg_offset(table_msg); - offset = (offset + IQUEUE_WRITER_MASK) & ~IQUEUE_WRITER_MASK; - table_msg = iqueue_msg(offset, table_len); - - // Atomic swap it into the header; if this fails we do not care. - // Some space in the iqueue will leak, but that is not a problem. - atomic_cas_64( - (void*)(uintptr_t) &idx->writer_tables[table_id].v, - 0, - table_msg.v - ); - - // Re-read the writer_table; either we succeeded or someone else has - // already written to it. - table_msg = idx->writer_tables[table_id]; - - TSLOGX(TSINFO, "%s: Created writer table offset 0x%"PRIx64" size %zu", - iqueue_name(iq), - iqueue_msg_offset(table_msg), - iqueue_msg_len(table_msg) - ); + if (!create) + return NULL; + + const size_t table_len = IQUEUE_WRITER_MAX * sizeof(shash_entry_t); + const size_t table_max_len = table_len + IQUEUE_WRITER_MASK; + + void * const table_buf = iqueue_allocate_raw( + iq, + table_max_len, + &table_msg + ); + if (!table_buf) + { + TSLOGXL(TSERROR, "%s: Unable to allocate table space %zu bytes", + iqueue_name(iq), + table_max_len + ); + return NULL; + } + + // Force alignment of the table since it will have 16-byte + // CAS operations done on it. 
+ uint64_t offset = iqueue_msg_offset(table_msg); + offset = (offset + IQUEUE_WRITER_MASK) & ~IQUEUE_WRITER_MASK; + table_msg = iqueue_msg(offset, table_len); + + // Atomic swap it into the header; if this fails we do not care. + // Some space in the iqueue will leak, but that is not a problem. + iqueue_msg_t* const null = NULL; + atomic_compare_exchange_strong( + &idx->writer_tables[table_id].v, + &null, + table_msg.v + ); + + // Re-read the writer_table; either we succeeded or someone else has + // already written to it. + table_msg = idx->writer_tables[table_id]; + + TSLOGXL(TSINFO, "%s: Created writer table offset 0x%"PRIx64" size %zu", + iqueue_name(iq), + iqueue_msg_offset(table_msg), + iqueue_msg_len(table_msg) + ); } const size_t table_len = iqueue_msg_len(table_msg); @@ -1974,22 +2048,22 @@ _iqueue_writer_table( if (!table_buf) { - TSLOGX(TSERROR, "%s: Unable to retrieve table at offset 0x%"PRIx64, - iqueue_name(iq), - table_offset - ); - return NULL; + TSLOGXL(TSERROR, "%s: Unable to retrieve table at offset 0x%"PRIx64, + iqueue_name(iq), + table_offset + ); + return NULL; } shash_t * const sh = shash_create(table_buf, table_len, 0); if (!sh) { - TSLOGX(TSERROR, "%s: Unable to generate table %p @ %zu", - iqueue_name(iq), - table_buf, - table_len - ); - return NULL; + TSLOGXL(TSERROR, "%s: Unable to generate table %p @ %zu", + iqueue_name(iq), + table_buf, + table_len + ); + return NULL; } // This might race with another thread, causing this to leak. @@ -2021,21 +2095,21 @@ iqueue_writer_update( { while (1) { - const uint64_t cur_timestamp = writer->value; - - // If the new value is less than the old value - // (and the old value is not -1), then there is no update - // to be performed. 
- if (cur_timestamp != (uint64_t) -1 - && cur_timestamp >= new_timestamp) - return 0; - - if (shash_update( - sh, - writer, - cur_timestamp, - new_timestamp - )) - return 1; + const uint64_t cur_timestamp = writer->value; + + // If the new value is less than the old value + // (and the old value is not -1), then there is no update + // to be performed. + if (cur_timestamp != (uint64_t) -1 + && cur_timestamp >= new_timestamp) + return 0; + + if (shash_update( + sh, + writer, + cur_timestamp, + new_timestamp + )) + return 1; } } diff --git a/src/iqueue2out.cc b/src/iqueue2out.cc new file mode 100644 index 0000000..93505d3 --- /dev/null +++ b/src/iqueue2out.cc @@ -0,0 +1,120 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include "try_unix.hh" +#include "iqueue.hh" +#include +#include +using std::experimental::optional; +using std::experimental::nullopt; +#include "stringer.hh" +using stringer::str; +#include + +struct options { + char *iqueue_path; + int outfd; + optional shash_key; + optional latency_tolerance; +}; + +struct options get_options(int argc, char **argv) +try { + if ((argc != 2) && (argc != 4)) throw std::runtime_error("Incorrect number of arguments"); + struct options opt = { + .iqueue_path = argv[1], + .outfd = 1, + .shash_key = nullopt, + .latency_tolerance = nullopt, + }; + try_(fcntl(opt.outfd, F_SETFL, try_(fcntl(opt.outfd, F_GETFL, 0)) & ~O_NONBLOCK)); + if (argc == 4) { + opt.shash_key = std::stoul(argv[2], nullptr, 0); + opt.latency_tolerance = std::stoul(argv[3], nullptr, 0); + } + return opt; +} catch (std::exception const& e) { + warnx("Usage: %s [shash_key] [latency_tolerance]", + (argc > 0 ? argv[0] : "iqueue2out")); + warnx("%s", e.what()); + exit(1); +} + +void iqueue2fd(ts::mmia::cpputils::iqueue& iq, int fd) { + auto it = iq.begin(); + for (;;) { + iq.wait(it); + gsl::span msg = *it; + if (msg.size() > 4096) { + warnx("Message is unrealistically huge. Truncating to 4096."); + msg = { msg.data(), 4096 }; + } + ssize_t written = try_(write(fd, reinterpret_cast(msg.data()), msg.size())); + if (written != msg.size()) { + warnx("Partial write? Did I get called with a non-packetized pipe?"); + exit(1); + } + ++it; + } +} + +void check_heartbeat(ts::mmia::cpputils::shash_heartbeat_entry& entry, std::chrono::nanoseconds latency_tolerance) { + using namespace std::chrono; + time_point const heartbeat_nanos = entry.value(); + time_point const now_nanos = high_resolution_clock::now(); + nanoseconds const time_since_last_heartbeat = now_nanos - heartbeat_nanos; + if (time_since_last_heartbeat > latency_tolerance) { + errx(1, "%s", str("This iqueue is too out-of-date! 
Exiting: " + "now (", duration_cast(now_nanos.time_since_epoch()).count(), ") ", + "- heartbeat (", + duration_cast(heartbeat_nanos.time_since_epoch()).count(), ") ", + "> latency_tolerance (", latency_tolerance.count(), ")").c_str()); + } +} + +void check_heartbeat_loop(ts::mmia::cpputils::shash_heartbeat_entry entry, std::chrono::nanoseconds latency_tolerance) { + for (;;) { + using namespace std::chrono; + time_point const heartbeat_nanos = entry.value(); + time_point const now_nanos = high_resolution_clock::now(); + nanoseconds const time_since_last_heartbeat = now_nanos - heartbeat_nanos; + if (time_since_last_heartbeat > latency_tolerance) { + errx(1, "%s", str("We failed our timing requirements, " + "now (", duration_cast(now_nanos.time_since_epoch()).count(), ") ", + "- heartbeat (", + duration_cast(heartbeat_nanos.time_since_epoch()).count(), ") ", + "> latency_tolerance (", latency_tolerance.count(), ")").c_str()); + } + std::this_thread::sleep_for((latency_tolerance - time_since_last_heartbeat)/2); + } +} + +int main(int argc, char **argv) { + struct options opt = get_options(argc, argv); + ts::mmia::cpputils::iqueue iq(opt.iqueue_path, ts::mmia::cpputils::access_mode::read_only); + optional heartbeat_thread = nullopt; + if (opt.shash_key) { + auto entry = iq.get_heartbeat_entry(*opt.shash_key); + auto const latency_tolerance = std::chrono::nanoseconds(*opt.latency_tolerance); + check_heartbeat(entry, latency_tolerance); + heartbeat_thread = std::thread(check_heartbeat_loop, entry, latency_tolerance); + } + iqueue2fd(iq, opt.outfd); +} diff --git a/src/iqueue_tail_count.cc b/src/iqueue_tail_count.cc new file mode 100644 index 0000000..d757990 --- /dev/null +++ b/src/iqueue_tail_count.cc @@ -0,0 +1,62 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include "try_unix.hh" +#include "iqueue.hh" + +struct options { + char *iqueue_path; +}; + +struct options get_options(int argc, char **argv) +try { + if (argc != 2) throw std::runtime_error("Incorrect number of arguments"); + struct options opt = { + .iqueue_path = argv[1], + }; + return opt; +} catch (std::exception const& e) { + warnx("Usage: %s ", + (argc > 0 ? argv[0] : "iqueue_tail_count")); + warnx("%s", e.what()); + exit(1); +} + +uint64_t nanos() { + using namespace std::chrono; + return duration_cast(high_resolution_clock::now().time_since_epoch()).count(); +} + +int main(int argc, char **argv) { + struct options opt = get_options(argc, argv); + ts::mmia::cpputils::iqueue iq(opt.iqueue_path, ts::mmia::cpputils::access_mode::read_only); + uint64_t id = 0; + for (;;) { + uint64_t const end = iqueue_end((iqueue_t*)iq); + if (id != end) { + uint64_t const delta = end - id; + try_(write(1, &delta, sizeof(delta))); + id = end; + } else { + std::this_thread::sleep_for(std::chrono::microseconds(10)); + } + } +} diff --git a/src/math_utils.c b/src/math_utils.c new file mode 100644 index 0000000..d44d96e --- /dev/null +++ b/src/math_utils.c @@ -0,0 +1,37 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "math_utils.h"
+
+/**
+ * Ceiling of log2(value): the smallest n such that 2^n >= value.
+ *
+ * Returns 0 for value <= 1 and 64 for value > 2^63.
+ */
+uint8_t
+ceilintlog2(uint64_t value)
+{
+    uint8_t deg = 0;
+    uint64_t pow2 = 1;
+    // pow2 wraps to 0 once it is shifted past bit 63; without the != 0
+    // test the loop would never terminate for value > 2^63.
+    while (pow2 != 0 && pow2 < value) {
+        ++deg;
+        pow2 <<= 1;
+    }
+    return deg;
+}
+
+/**
+ * Smallest power of two that is >= value.
+ *
+ * Returns 1 for value <= 1, and 0 when the result (2^64) does not fit
+ * in a uint64_t, i.e. for value > 2^63.
+ */
+uint64_t
+ceilintpow2(uint64_t value)
+{
+    uint64_t pow2 = 1;
+    // Stop once the shift wraps to 0 instead of spinning forever.
+    while (pow2 != 0 && pow2 < value) {
+        pow2 <<= 1;
+    }
+    return pow2;
+}
diff --git a/src/math_utils.h b/src/math_utils.h
new file mode 100644
index 0000000..67e6b13
--- /dev/null
+++ b/src/math_utils.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright 2021 Two Sigma Open Source, LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include
+uint64_t ceilintpow2(uint64_t value);
+uint8_t ceilintlog2(uint64_t value);
diff --git a/src/net_utils.c b/src/net_utils.c
new file mode 100644
index 0000000..af6339f
--- /dev/null
+++ b/src/net_utils.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright 2021 Two Sigma Open Source, LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "tslog.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BACKLOG 250 + +/** + * Sets up a socket + * + * param #1: node + * param #2: service + * param #3: setup function, e.g. connect(), bind() + * (the socket type is not a parameter; it is always SOCK_STREAM) + * + * Returns non-negative file descriptor on success, -1 on failure + */ +static int +setup_socket(const char *node, const char *service, + int (*action)(int, const struct sockaddr *, socklen_t)) +{ + struct addrinfo *aai, *ai; + int fd, ret; + + struct addrinfo hints = {}; + + hints.ai_family = PF_INET; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_PASSIVE; + + ret = getaddrinfo(node, service, &hints, &aai); + if (ret != 0) { + TSLOGX(TSWARN, "cannot resolve %s:%s (%s)", node, service, + gai_strerror(ret)); + errno = ENOENT; + return -1; + } + if (aai == NULL) { + TSLOGX(TSWARN, "no addresses for %s:%s", node, service); + errno = ENOENT; + return -1; + } + + for (ai = aai; ai; ai = ai->ai_next) { + fd = socket(ai->ai_family, SOCK_STREAM, ai->ai_protocol); + if (fd == -1) + continue; + + int sockoptvalue = 1; + if (action == bind) { + ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &sockoptvalue, sizeof(sockoptvalue)); + if (ret == -1) { + TSLOG(TSWARN, "cannot set reuseaddr"); + (void)close(fd); + fd = -1; + continue; + } + } + + ret = (*action)(fd, ai->ai_addr, ai->ai_addrlen); + if (ret == -1) { + (void)close(fd); + fd = -1; + continue; + } + break; + } + freeaddrinfo(aai); + if (fd == -1) { +
TSLOG(TSWARN, "could not setup socket %s:%s of type %d", + node, service, SOCK_STREAM); + return -1; + } + + int sockopt = 1; + ret = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &sockopt, sizeof(sockopt)); + if (ret == -1) { + TSLOG(TSWARN, "cannot set tcp nodelay to %d", sockopt); + close(fd); + return -1; + } + return fd; +} + +/** + * Creates TCP client socket + * + * param #1: node + * param #2: service + * + * Returns non-negative file descriptor on success, -1 on failure + */ +int +tsnet_tcp_client_socket(const char *node, const char *service) +{ + assert(node != NULL); + assert(service != NULL); + return setup_socket(node, service, connect); +} + +/** + * Creates TCP server socket + * + * param #1: node + * param #2: service + * + * Returns non-negative file descriptor on success, -1 on failure + */ +int +tsnet_tcp_server_socket(const char *node, const char *service) +{ + assert(node != NULL); + assert(service != NULL); + int fd = setup_socket(node, service, bind); + if (fd == -1) + return -1; + if (listen(fd, BACKLOG) == -1) { + TSLOG(TSWARN, "cannot listen on socket %s:%s", node, service); + close(fd); + return -1; + } + return fd; +} diff --git a/src/net_utils.h b/src/net_utils.h new file mode 100644 index 0000000..f775533 --- /dev/null +++ b/src/net_utils.h @@ -0,0 +1,38 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/* The need for this header can be removed by removing TCP connection + * setup support from iqsync-main, and replacing it with a Python + * script to do the setup. */ + +/** + * Creates TCP server socket + * + * param #1: node + * param #2: service + * param #3: int, meaning not documented -- NOTE(review): net_utils.c defines this function with only (node, service); confirm which signature is intended + * + * Returns non-negative file descriptor on success, -1 on failure + */ +int tsnet_tcp_server_socket(const char *, const char *, int); + +/** + * Creates TCP client socket + * + * param #1: node + * param #2: service + * param #3: int, meaning not documented -- NOTE(review): net_utils.c defines this function with only (node, service); confirm which signature is intended + * + * Returns non-negative file descriptor on success, -1 on failure + */ +int tsnet_tcp_client_socket(const char *, const char *, int); diff --git a/src/proc_utils.c b/src/proc_utils.c new file mode 100644 index 0000000..caefb36 --- /dev/null +++ b/src/proc_utils.c @@ -0,0 +1,105 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */
+#include "proc_utils.h"
+#include "stdint.h"
+#include "tslog.h"
+
+/** Close both ends of a pipe pair; an end still set to -1 was never opened. */
+static void
+close_pipe_pair(const int pair[2])
+{
+    if (pair[0] >= 0)
+        close(pair[0]);
+    if (pair[1] >= 0)
+        close(pair[1]);
+}
+
+/**
+ * Fork and exec `file`, connecting the standard streams selected by
+ * redirect_mask to pipes held by the caller.
+ *
+ * \param fd receives the parent's pipe ends: fd[0] writes to the child's
+ *     stdin, fd[1] reads its stdout, fd[2] reads its stderr.  Entries whose
+ *     stream is not redirected are set to -1.  Not written on failure.
+ * \param redirect_mask bitwise OR of TSIO_STDIN_MASK, TSIO_STDOUT_MASK and
+ *     TSIO_STDERR_MASK.
+ * \param file program to run, passed to execvp().
+ * \param argv argument vector passed to execvp().
+ * \return the child's pid, or -1 if a pipe or the fork could not be
+ *     created; pipes opened before the failure are closed again.
+ */
+pid_t
+tsio_open3(
+    int fd[],
+    unsigned redirect_mask,
+    const char * file,
+    const char * const argv[]
+)
+{
+    // Start at -1 so the error paths can tell which pipes were created.
+    int fd_to_child[2] = { -1, -1 };
+    int fd_stdout_from_child[2] = { -1, -1 };
+    int fd_stderr_from_child[2] = { -1, -1 };
+
+    if ((redirect_mask & TSIO_STDIN_MASK) && pipe(fd_to_child) < 0) {
+        TSLOG(TSERROR, "Unable to create pipe");
+        return -1;
+    }
+    if ((redirect_mask & TSIO_STDOUT_MASK) && pipe(fd_stdout_from_child) < 0) {
+        TSLOG(TSERROR, "Unable to create pipe");
+        close_pipe_pair(fd_to_child);
+        return -1;
+    }
+    if ((redirect_mask & TSIO_STDERR_MASK) && pipe(fd_stderr_from_child) < 0) {
+        TSLOG(TSERROR, "Unable to create pipe");
+        close_pipe_pair(fd_to_child);
+        close_pipe_pair(fd_stdout_from_child);
+        return -1;
+    }
+
+    pid_t pid = fork();
+    if (pid < 0) {
+        TSLOG(TSERROR, "Unable to fork child!");
+        // No child exists to use the pipes, so release all of them.
+        close_pipe_pair(fd_to_child);
+        close_pipe_pair(fd_stdout_from_child);
+        close_pipe_pair(fd_stderr_from_child);
+        return -1;
+    }
+
+    if (pid == 0)
+    {
+        // Child; copy the pipe descriptors to stdin/stdout
+        if (redirect_mask & TSIO_STDIN_MASK)
+        {
+            if (dup2(fd_to_child[0], STDIN_FILENO) < 0)
+                TSABORT("Unable to dup stdin fd");
+            close(fd_to_child[0]);
+            close(fd_to_child[1]);
+        }
+
+        if (redirect_mask & TSIO_STDOUT_MASK)
+        {
+            if (dup2(fd_stdout_from_child[1], STDOUT_FILENO) < 0)
+                TSABORT("Unable to dup stdout fd");
+            close(fd_stdout_from_child[0]);
+            close(fd_stdout_from_child[1]);
+        }
+
+        if (redirect_mask & TSIO_STDERR_MASK)
+        {
+            if (dup2(fd_stderr_from_child[1], STDERR_FILENO) < 0)
+                TSABORT("Unable to dup stderr fd");
+            close(fd_stderr_from_child[0]);
+            close(fd_stderr_from_child[1]);
+        }
+
+        // execvp() only returns on failure
+        execvp(file, (void*)(uintptr_t) argv);
+        TSABORT("Unable to exec %s", file);
+    }
+
+    // Parent process continues here; clean up the dangling fds
+    if (redirect_mask & TSIO_STDIN_MASK)
+    {
+        fd[0] = fd_to_child[1];
+        close(fd_to_child[0]); // read end
+    } else
+        fd[0] = -1;
+
+    if (redirect_mask & TSIO_STDOUT_MASK)
+    {
+        fd[1] = fd_stdout_from_child[0];
+        close(fd_stdout_from_child[1]); // write end
+    } else
+        fd[1] = -1;
+
+    if (redirect_mask & TSIO_STDERR_MASK)
+    {
+        fd[2] = fd_stderr_from_child[0];
+        close(fd_stderr_from_child[1]); // write end
+    } else
+        fd[2] = -1;
+
+    return pid;
+}
diff --git a/src/proc_utils.h b/src/proc_utils.h
new file mode 100644
index 0000000..26ba65b
--- /dev/null
+++ b/src/proc_utils.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2021 Two Sigma Open Source, LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* The need for this header can be removed by removing SSH connection
+ * setup support from iqsync-main, and replacing it with a Python
+ * script to do the setup. */
+#include
+
+#define TSIO_STDIN_MASK 1
+#define TSIO_STDOUT_MASK 2
+#define TSIO_STDERR_MASK 4
+
+/**
+ * Spawn a process with stdin/stdout/stderr captured by the
+ * calling process.
+ *
+ * \return pid on success, -1 on failure.
+ * \param redirect_mask is a bitmask of which fds to redirect (0, 1 and 2)
+ * \param fd_out[] will be the child's stdin, stdout and stderr.
+ */
+pid_t
+tsio_open3(
+    int fd_out[3],
+    unsigned redirect_mask,
+    const char * file,
+    const char * const argv[]
+);
diff --git a/src/shash.c b/src/shash.c
index 2dbba32..e32a77c 100644
--- a/src/shash.c
+++ b/src/shash.c
@@ -1,8 +1,23 @@
+/*
+ * Copyright 2021 Two Sigma Open Source, LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include "twosigma.h" +#include #include "tslog.h" #include "shash.h" -#include "atomic.h" #include "tslock.h" @@ -334,9 +349,9 @@ shash_update( return 0; // No locks need to be held for this update - return atomic_cas_bool_64( - (void*)(uintptr_t) &entry->value, - old_value, + return atomic_compare_exchange_strong( + &entry->value, + &old_value, new_value ); } diff --git a/src/try_unix.hh b/src/try_unix.hh new file mode 100644 index 0000000..b3e7385 --- /dev/null +++ b/src/try_unix.hh @@ -0,0 +1,50 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include + +//// Utility functions +// a poor imitation of Rust try!() +// aborts if retval < 0, otherwise returns retval +namespace ts { namespace mmia { namespace cpputils { namespace try_unix { +inline int try_func(int retval, const char *func, int line) { + if (retval < 0) { + // err(1, "%s: %d", func, line); + warn("%s: %d", func, line); + throw std::system_error(std::error_code(errno, std::system_category())); + } + return retval; +} + +// like try_func, but doesn't abort if errno is in exclude_errors +inline int try_exclude_func(int retval, std::initializer_list exclude_errors, const char *func, int line) { + if (retval < 0) { + for (int err : exclude_errors) { + if (errno == err) return retval; + } + // err(1, "%s: %d", func, line); + warn("%s: %d", func, line); + throw std::system_error(std::error_code(errno, std::system_category())); + } + return retval; +} +}}}} + +#define try_(x) ts::mmia::cpputils::try_unix::try_func(x, __FUNCTION__, __LINE__) +#define try_exclude(x, ...) ts::mmia::cpputils::try_unix::try_exclude_func(x, __VA_ARGS__, __FUNCTION__, __LINE__) diff --git a/src/tsassert.c b/src/tsassert.c new file mode 100644 index 0000000..35e619d --- /dev/null +++ b/src/tsassert.c @@ -0,0 +1,65 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "twosigma.h" + +#include +#include +#include +#include + +#include "tsassert.h" +#include "tslog.h" + +void +tsassert_fail( + const char * const assertion, + const char * const file, + const unsigned int line, + const char * const func, + const char * fmt, + ... +) +{ + tslog_info_t info = { + .file = file, + .line = line, + .func = func, + .level = TSFATAL, + .do_perror = 0, + }; + + if (fmt != NULL) { + va_list ap; + va_start(ap, fmt); + tslog_info_vargs(&info, fmt, ap); + va_end(ap); + } + + tslog_info(&info, "Assertion `%s' failed. Backtrace:", assertion); + + void * buffer[100]; + int frames = backtrace(buffer, __arraycount(buffer)); + char ** strings = backtrace_symbols(buffer, frames); + if (!strings) + TSABORT("backtrace failed"); + + // Start at 1 so that the tsassert does not show in the backtrace + for (int i = 1 ; i < frames ; i++) + tslog_info(&info, "%d: %s", i, strings[i]); + + abort(); +} diff --git a/src/tsassert.h b/src/tsassert.h new file mode 100644 index 0000000..6c9c618 --- /dev/null +++ b/src/tsassert.h @@ -0,0 +1,62 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _TSASSERT_H_ +#define _TSASSERT_H_ + +#include +#include +__BEGIN_DECLS + +__attribute__((__noreturn__,__format__(__printf__, 5, 6))) +void tsassert_fail(const char *, const char *, unsigned int, const char *, const char *, ...); + +__END_DECLS + +// like assert(), but isn't dependent on NDEBUG. i.e. it always asserts, even +// if NDEBUG is defined. this is useful when your test condition has side +// effects, e.g. see tsassert_pthread_mutex_lock below. +#define tsassert(x) \ + do { \ + if (__predict_false(! (x))) \ + tsassert_fail(__STRING(x), __FILE__, __LINE__, __func__, NULL); /* NOLINT */ \ + } while (false) + +#define tsassert_format(x, fmt, ...) \ + do { \ + if (__predict_false(! (x))) \ + tsassert_fail(__STRING(x), __FILE__, __LINE__, __func__, fmt, ## __VA_ARGS__); /*NOLINT*/ \ + } while (false) + +#define tsassert_pthread_mutex_lock(x) tsassert(pthread_mutex_lock(x) == 0) +#define tsassert_pthread_mutex_unlock(x) tsassert(pthread_mutex_unlock(x) == 0) + +/** Compile time failure if a structure is not sized correctly */ +#define size_check(t,size) static_assert(sizeof(t) == size, "Incorrect size of '" #t "'") + +#define memcpy_buf(dst, src) \ +do { \ + static_assert(sizeof(dst) >= sizeof(src), ""); \ + memcpy(dst, src, sizeof(src)); \ +} while (false) + +#define memcpy_buf_exact(dst, src) \ +do { \ + static_assert(sizeof(dst) == sizeof(src), ""); \ + memcpy(dst, src, sizeof(src)); \ +} while (false) + +#endif diff --git a/src/tsclock.h b/src/tsclock.h new file mode 100644 index 0000000..998851c --- /dev/null +++ b/src/tsclock.h @@ -0,0 +1,36 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include + +typedef int64_t tsclock_nanos_t; +#define TIMESPEC_TO_NANOS(ts) \ + ((ts)->tv_sec * NANOS_IN_SECOND + (ts)->tv_nsec) + +#define NANOS_IN_MICRO 1000LL +#define MICROS_IN_MILLI 1000LL +#define MILLIS_IN_SECOND 1000LL +#define NANOS_IN_MILLI (NANOS_IN_MICRO * MICROS_IN_MILLI) +#define NANOS_IN_SECOND (NANOS_IN_MILLI * MILLIS_IN_SECOND) + +static inline tsclock_nanos_t __attribute__((__always_inline__)) +tsclock_getnanos(int clockid) +{ + struct timespec ts; + if (clock_gettime(clockid, &ts) == -1) + return -1; + return TIMESPEC_TO_NANOS(&ts); +} diff --git a/src/tsdir.c b/src/tsdir.c new file mode 100644 index 0000000..1f19629 --- /dev/null +++ b/src/tsdir.c @@ -0,0 +1,127 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "twosigma.h" +#include "tslog.h" + +#include +#include +#include +#include +#include + +#include +#include + +#include "tsdir.h" + +int tsdir_mkdir(const char *path, mode_t mode) +{ + const char *path_p; + char *tmp; + int retval = 0; + int result; + mode_t old_umask; + + tmp = malloc(strlen(path) + 1); + + if (tmp == NULL) + return -1; + + /* skip first / */ + path_p = path+1; + old_umask = umask(0); + + for (;;) { + while (*path_p != '/') { + if (*path_p == 0) { + break; + } + ++path_p; + } + + if (*path_p == '/') { + strncpy(tmp, path, path_p - path); + tmp[path_p-path] = '\0'; + if (tmp[path_p - path - 1] != '/') { + result = mkdir(tmp, mode); + if (result == -1) { + if (!(errno == EEXIST || errno == EACCES || errno == EROFS)) { + /* Then this is a real error */ + TSLOG(TSERROR, "Error calling mkdir()"); + retval = -1; + break; + } + } + } + + /* pass / */ + path_p++; + + } else { + /* last component */ + result = mkdir(path, mode); + + if (result == -1) { + if (errno == EEXIST) { + int result2; + + /* If it exists, make sure it really is a directory. 
*/ + result2 = tsdir_exists(path); + if (result2 == -1) { + TSLOGX(TSERROR, "Error calling tsdir_exists()"); + retval = -1; + } + else if (result2 != 1) { + TSLOGX(TSERROR, "Something with this name exists, but it is not a directory."); + retval = -1; + } + } + else { + TSLOG(TSERROR, "Error calling mkdir()"); + retval = -1; + } + } + + break; + } + } + + free(tmp); + umask(old_umask); + + return retval; +} + +int tsdir_exists(const char *dir) +{ + int result; + struct stat st; + + result = stat(dir, &st); + if (result == -1 && errno == ENOENT) { + return 0; + } + else if (result == -1) { + TSLOG(TSERROR, "Error calling stat()"); + return -1; + } + + if (!S_ISDIR(st.st_mode)) { + return 0; + } + + return 1; +} diff --git a/src/tsdir.h b/src/tsdir.h new file mode 100644 index 0000000..bbd6951 --- /dev/null +++ b/src/tsdir.h @@ -0,0 +1,50 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _TSDIR_H_ +#define _TSDIR_H_ + +#include + +/* Like the mkdir() system call, but make all parent directories + * if they don't exist. Also, don't fail if the directory already + * exists. This is like the "mkdir -p" shell command. 
+ * + * @path: the absolute or relative path of the directory to create + * @mode: the mode of the created directories, identical to the + * mode argument of the mkdir() system call + * + * Return value: + * -1: Error (other than the directory already existed) + * 0: Success + */ + +int tsdir_mkdir(const char *path, mode_t mode); + + +/* Check if a directory exists + * + * @dir: the directory to test + * + * Return value: + * -1: Error + * 0: Directory does not exist + * 1: Directory exists + */ + +int tsdir_exists(const char *dir); + +#endif /* TSDIR_H */ diff --git a/src/tsflexhash.c b/src/tsflexhash.c new file mode 100644 index 0000000..d938135 --- /dev/null +++ b/src/tsflexhash.c @@ -0,0 +1,299 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "twosigma.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tslog.h" +#include "assert.h" +#include "err.h" +#include "tsflexhash_private.h" + + +typedef struct entry_block entry_block_t; + +struct entry_block { + entry_block_t *next; +}; + +typedef struct { + uint32_t mask, size; + void *pool; + size_t entry_size, next_offset; + const char *name; + void *(*malloc_function)(size_t); + void (*free_function)(void *); + entry_block_t *entry_blocks; + uint32_t collision_count; + uint32_t capacity_bit_count; + uint64_t chain_link_traversal_count; + uint64_t align; +} tsflexhash_t; + +typedef struct { + uint32_t mask, size; + void *pool; + size_t entry_size, next_offset; + const char *name; + void *(*malloc_function)(size_t); + void (*free_function)(void *); + entry_block_t *entry_blocks; + uint32_t collision_count; + uint32_t capacity_bit_count; + uint64_t chain_link_traversal_count; + void *entries; +} tsflexhash_new_t; + +// this makes sure all pages are faulted in, and it checks that the malloc +// function is returning memory that is zeroed out. +static void +memtouch(void *p, size_t len) +{ + TSLOGX(TSDIAG, "Checking %zu bytes at %p", len, p); + assert((len & 0x3) == 0); + uint32_t *i = p; + uint32_t *i1 = i + (len >> 2); + while (i < i1) { + if (*i != 0) { + errx(1, "Prefaulted byte %p is nonzero", i); + } + i++; + } + TSLOGX(TSDIAG, "%zu bytes at %p are zero", len, p); +} + +void * +tsflexhash_create_norehash(size_t entry_size, size_t next_offset, uint32_t capacity, + const char *name, void *(*malloc_function)(size_t), + void (*free_function)(void *)) +{ + TSLOGX(TSDEBUG, "Creating %s hashtable of size %" PRIu32 ". Entry size: " + "%zu. 
Next offset: %zu", name, capacity, entry_size, next_offset); + assert(capacity != 0); + uint32_t mask = capacity - 1; + if ((capacity & mask) != 0) { + TSLOGX(TSERROR, "Capacity is not a power of two: %" PRIu32, capacity); + abort(); + } + // malloc() used to be okay, but once we started using + // tsflexhash_replenish_pool_norehash(), we need memory to be zeroed out. + if (malloc_function == &malloc) { + TSLOGX(TSERROR, "malloc() is not a valid memory allocator."); + abort(); + } + size_t malloc_size = sizeof(tsflexhash_t) - sizeof(uint64_t) + ((capacity + 1) * entry_size); + TSLOGX(TSDEBUG, "malloc_size: %zu", malloc_size); + tsflexhash_t *hashtable = malloc_function(malloc_size); + assert(hashtable != NULL); + memtouch(hashtable, malloc_size); + hashtable->mask = mask; + hashtable->entry_size = entry_size; + hashtable->next_offset = next_offset; + hashtable->name = name; + hashtable->malloc_function = malloc_function; + hashtable->free_function = free_function; + hashtable->pool = TSFLEXHASH_SENTINEL; + hashtable->collision_count = 0; + hashtable->chain_link_traversal_count = 0; + hashtable->capacity_bit_count = ffs(capacity) - 1; + tsflexhash_replenish_pool_norehash(hashtable, true); + return hashtable; +} + +void * +tsflexhash_create_rehash(size_t entry_size, size_t next_offset, uint32_t capacity, + const char *name, void *(*malloc_function)(size_t), + void (*free_function)(void *)) +{ + TSLOGX(TSDEBUG, "Creating %s hashtable of size %" PRIu32 ". Entry size: " + "%zu. Next offset: %zu", name, capacity, entry_size, next_offset); + assert(capacity != 0); + uint32_t mask = capacity - 1; + if ((capacity & mask) != 0) { + TSLOGX(TSERROR, "Capacity is not a power of two: %" PRIu32, capacity); + abort(); + } + // malloc() used to be okay, but once we started using + // tsflexhash_replenish_pool_rehash(), we need memory to be zeroed out. 
+ if (malloc_function == &malloc) { + TSLOGX(TSERROR, "malloc() is not a valid memory allocator."); + abort(); + } + size_t malloc_size = sizeof(tsflexhash_t) + (capacity * entry_size); + TSLOGX(TSDEBUG, "malloc_size: %zu", malloc_size); + tsflexhash_new_t *hashtable = malloc_function(sizeof(tsflexhash_new_t)); + assert(hashtable != NULL); + hashtable->mask = mask; + hashtable->entry_size = entry_size; + hashtable->next_offset = next_offset; + hashtable->name = name; + hashtable->malloc_function = malloc_function; + hashtable->free_function = free_function; + hashtable->pool = TSFLEXHASH_SENTINEL; + hashtable->entries = malloc_function(capacity * entry_size); + assert(hashtable->entries != NULL); + memtouch(hashtable->entries, capacity * entry_size); + hashtable->capacity_bit_count = ffs(capacity) - 1; + hashtable->collision_count = 0; + hashtable->chain_link_traversal_count = 0; + tsflexhash_replenish_pool_rehash(hashtable, true); + return hashtable; +} + +void +tsflexhash_destroy_norehash(void *p) +{ + tsflexhash_t *hashtable = p; + entry_block_t *entry_block = hashtable->entry_blocks; + while (entry_block != NULL) { + entry_block_t *next = entry_block->next; + hashtable->free_function(entry_block); + entry_block = next; + } + hashtable->free_function(hashtable); +} + +void +tsflexhash_destroy_rehash(void *p) +{ + tsflexhash_new_t *hashtable = p; + entry_block_t *entry_block = hashtable->entry_blocks; + while (entry_block != NULL) { + entry_block_t *next = entry_block->next; + hashtable->free_function(entry_block); + entry_block = next; + } + hashtable->free_function(hashtable->entries); + hashtable->free_function(hashtable); +} + +// this new version of tsflexhash_replenish_pool uses TSFLEXHASH_SENTINEL to +// mark the end of the list (instead of NULL). this allows us to add blocks of +// entries to the pool without having to touch each one. 
see tsflexhash_insert() +// for more information +void * +tsflexhash_replenish_pool_norehash(void *p, bool first) +{ + tsflexhash_t *hashtable = p; + uint32_t replenish_count = (hashtable->mask + 1) / 4; + if (replenish_count == 0) { + replenish_count = 1; + } + size_t malloc_size = sizeof(entry_block_t) + + (replenish_count * hashtable->entry_size); + // use TSINFO here when the pool is replenished during use (i.e. not during + // creation), since this means your table is over-loaded and you should + // probably increase its size. + int level = first ? TSDEBUG : TSINFO; + TSLOGX(level, "Replenishing pool for %s. Will malloc %zu bytes " + "for %" PRIu32 " entries. Table currently has %" PRIu32 + " entries and a capacity of %" PRIu32 ".", + hashtable->name, + malloc_size, + replenish_count, + hashtable->size, + hashtable->mask + 1); + assert(hashtable->pool == TSFLEXHASH_SENTINEL); + entry_block_t *block = hashtable->malloc_function(malloc_size); + assert(block != NULL); + if (first) { + memtouch(block, malloc_size); + } + assert(block->next == NULL); // this memory should be all zeroes + + // update the linked list of blocks (used in tsflexhash_destroy) + block->next = hashtable->entry_blocks; + hashtable->entry_blocks = block; + + // update the hashtable's pool pointer + void *entry = block + 1; + hashtable->pool = entry; + + // find the 'next' pointer in the last entry in the block, and set it + // to the sentinel value. 
this marks the end of the block + char *last_entry = ((char *) entry) + + (hashtable->entry_size * (replenish_count - 1)); + void **next = (void *) (last_entry + hashtable->next_offset); + assert(*next == NULL); // this memory should be all zeroes + *next = TSFLEXHASH_SENTINEL; + + return hashtable->pool; +} + +void * +tsflexhash_replenish_pool_rehash(void *p, bool first) +{ + tsflexhash_new_t *hashtable = p; + uint32_t replenish_count = (hashtable->mask + 1) / 4; + if (replenish_count == 0) { + replenish_count = 1; + } + size_t malloc_size = sizeof(entry_block_t) + + (replenish_count * hashtable->entry_size); + + // For rehashing tables, don't print messages about the table growing + + assert(hashtable->pool == TSFLEXHASH_SENTINEL); + entry_block_t *block = hashtable->malloc_function(malloc_size); + assert(block != NULL); + if (first) { + memtouch(block, malloc_size); + } + assert(block->next == NULL); // this memory should be all zeroes + + // update the linked list of blocks (used in tsflexhash_destroy) + block->next = hashtable->entry_blocks; + hashtable->entry_blocks = block; + + // update the hashtable's pool pointer + void *entry = block + 1; + hashtable->pool = entry; + + // find the 'next' pointer in the last entry in the block, and set it + // to the sentinel value. this marks the end of the block + char *last_entry = ((char *) entry) + + (hashtable->entry_size * (replenish_count - 1)); + void **next = (void *) (last_entry + hashtable->next_offset); + assert(*next == NULL); // this memory should be all zeroes + *next = TSFLEXHASH_SENTINEL; + + return hashtable->pool; +} + +// simple malloc function. note that it must return memory that is zeroed out. 
// default allocator for the hashtables. hands out zero-filled memory, which
// the table code requires, and goes through TSABORT when calloc() fails.
void *
tsflexhash_malloc(size_t size)
{
    void *zeroed = calloc(1, size);
    if (zeroed != NULL) {
        return zeroed;
    }
    TSABORT("Failure allocating %zu bytes", size);
    return zeroed;
}

// default deallocator, the counterpart of tsflexhash_malloc(). the pointer
// must not be NULL.
void
tsflexhash_free(void *p)
{
    assert(p != NULL);
    free(p);
}
Here is an example of how this works: + * + * #include + * + * typedef struct { + * int order_id; + * const char *ticker; + * int shares; + * } order_t; + * + * #define TSFLEXHASH_NAME order_table + * #define TSFLEXHASH_KEY_TYPE int + * #define TSFLEXHASH_VAL_TYPE order_t + * #include "tsflexhash.h" + * + * static order_table_t *ot; + * static int next_order_id = 1; + * + * int main() { + * ot = order_table_create(4096); + * } + * + * order_t *create_order(const char *ticker, int shares) { + * order_t *order = order_table_insert(ot, next_order_id); + * order->id = next_order_id++; + * order->ticker = ticker; + * order->shares = shares; + * return order; + * } + * + * order_t *get_order(int order_id) { + * return order_table_get(ot, order_id); + * } + * + * order_t *kill_order(int order_id) { + * order = order_table_remove(ot, order_id); + * } + * + * void print_orders(void) { + * printf("total order_count: %" PRIu32 "\n", order_table_size(ot)); + * order_table_iterator_t i; + * order_table_iterator_init(ot, &i); + * for (;;) { + * int order_id; + * order_t *order = order_table_iterator_next(ot, &i, &order_id); + * if (order == NULL) { + * break; + * } + * assert(order_id == order->id); + * printf("%d: %d shares of %s\n", order->id, order->shares, order_ticker); + * } + * } + * + * #define TSFLEXHASH_AUTO_REHASH 1 + * #include "tsflexhash.h" + * + * void create_order_with_auto_rehash(const char *ticker, int shares) { + * order_table_create(32); + * for (int i = 0; i < 4096; i++) { + * order_t *order = order_table_insert(ot, next_order_id); // rehash will auto happen here. 
+ * order->id = next_order_id++; + * order->ticker = ticker; + * order->shares = shares; + * } + * } + * + * #define TSFLEXHASH_REHASH 1 + * #include "tsflexhash.h" + * void create_order_with_rehash(const char *ticker, int shares) { + * ot = order_table_create(256); + * for (int j = 0; j < 16; j++) + * for (int i = 0; i < 256; i++) { + * order_t *order = order_table_insert(ot, next_order_id); // rehash will auto happen here. + * order->id = next_order_id++; + * order->ticker = ticker; + * order->shares = shares; + * } + * order_table_rehash(ot); + * } + * } + */ + +#include +#include +#include +#include +#include "tsflexhash_private.h" +#include "tsgosmacs.h" +#include "twosigma.h" + +__BEGIN_DECLS + +// a name like order_table +#ifndef TSFLEXHASH_NAME +# error "TSFLEXHASH_NAME not defined" +#endif /* TSFLEXHASH_NAME */ + +// set this if you are using char * keys and want the standard behavior +// (gosmacs hash function, NULL keys not allowed) +#ifndef TSFLEXHASH_STRING_KEYS +# define TSFLEXHASH_STRING_KEYS 0 +#endif /* TSFLEXHASH_STRING_KEYS */ + +// same as above, but keys are const char * +#ifndef TSFLEXHASH_CONST_STRING_KEYS +# define TSFLEXHASH_CONST_STRING_KEYS 0 +#endif /* TSFLEXHASH_CONST_STRING_KEYS */ + +#if TSFLEXHASH_STRING_KEYS +# define TSFLEXHASH_KEY_TYPE char * +# define TSFLEXHASH_KEY_CONST 0 +#elif TSFLEXHASH_CONST_STRING_KEYS +# define TSFLEXHASH_KEY_TYPE const char * +# define TSFLEXHASH_KEY_CONST 1 +#endif /* TSFLEXHASH_STRING_KEYS */ + +#if TSFLEXHASH_STRING_KEYS || TSFLEXHASH_CONST_STRING_KEYS +# define TSFLEXHASH_KEY_EQUALS(a,b) strcmp(a,b) == 0 +# define TSFLEXHASH_KEY_HASH(k) gosmacs_hash(k) +#endif /* TSFLEXHASH_STRING_KEYS */ + +// a type like int64_t or char * +#ifndef TSFLEXHASH_KEY_TYPE +# error "TSFLEXHASH_KEY_TYPE not defined" +#endif /* TSFLEXHASH_KEY_TYPE */ + +// Optional: set this to 1 if your key type includes a "const" qualifier, 0 if +// it doesn't. 
If you set it to 0, "const" to be added to the key parameters +// of get() and remove(), which lets you pass const arguments in (e.g. string +// literals) +#ifndef TSFLEXHASH_KEY_CONST +# define TSFLEXHASH_GET_KEY_QUALIFIER +#elif TSFLEXHASH_KEY_CONST +# define TSFLEXHASH_GET_KEY_QUALIFIER +#else +# define TSFLEXHASH_GET_KEY_QUALIFIER const +#endif /* TSFLEXHASH_KEY_CONST */ + +// if you need to free your keys, set this to 1, and the remove function will +// have an extra argument that gives you a pointer to the stored key +#ifndef TSFLEXHASH_REMOVE_RETURNS_KEY +# define TSFLEXHASH_REMOVE_RETURNS_KEY 0 +#endif /* TSFLEXHASH_REMOVE_RETURNS_KEY */ + +// if you want to use erase instead of remove. set this to 1. +// unlike remove, erase returns nothing and zeroes out the whole entry, +// so your insert is guaranteed to give you zero buffer when using erase, +// noted that erase is more expensive than remove because it zeroes out +// the entire entry rather than only set the invalid bit +#ifndef TSFLEXHASH_USE_ERASE +# define TSFLEXHASH_USE_ERASE 0 +#endif /* TSFLEXHASH_USE_ERASE */ + +// an optional comparison function. this is only needed if your keys are +// pointers and you want to compare the values +#ifndef TSFLEXHASH_KEY_EQUALS +# define TSFLEXHASH_KEY_EQUALS(a,b) a == b +#endif /* TSFLEXHASH_KEY_EQUALS */ + +// an optional hashcode function. this is needed if your keys are pointers. +// you may also want to provide it if the default implementation results in too +// many collisions +// TSFLEXHASH_KEY_HASH should take one argument, key, and return hashed value of the key +// TSFLEXHASH_KEY_HASH_EXT should take two arguments, hashtable and key, and return hashed value of the key +#if !(defined TSFLEXHASH_KEY_HASH || defined TSFLEXHASH_KEY_HASH_EXT) +# define TSFLEXHASH_KEY_HASH(key) key +#endif /* TSFLEXHASH_KEY_HASH */ + +// the type of values stored in the table, e.g. order_t. 
+#ifndef TSFLEXHASH_VAL_TYPE +# error "TSFLEXHASH_VAL_TYPE not defined" +#endif /* TSFLEXHASH_VAL_TYPE */ + +// the size of your value type. this is only needed if you want to override the +// default size of sizeof(val_type) +// TSFLEXHASH_VAL_SIZE + +// set to non-zero if you want to store pointers in the table, or primitive +// values, rather than structs. this changes the API for get() and insert() +// functions +#ifndef TSFLEXHASH_COPY_VAL +# define TSFLEXHASH_COPY_VAL 0 +#endif /* TSFLEXHASH_COPY_VAL */ + +// set this if you want get(), remove(), and iterator_next() to return something +// other than 0 when there is no entry to return +#ifndef TSFLEXHASH_EMPTY_RETURN_VAL +# define TSFLEXHASH_EMPTY_RETURN_VAL (TSFLEXHASH_RETURN_TYPE) 0 +#endif /* TSFLEXHASH_EMPTY_RETURN_VAL */ + +// set this to 0 if you want to speed up insert/remove operations by not +// incrementing/decrementing the hashtable size counter +#ifndef TSFLEXHASH_UPDATE_SIZE +# define TSFLEXHASH_UPDATE_SIZE 1 +#endif /* TSFLEXHASH_UPDATE_SIZE */ + +// set this to 1 if you want to count number of collisions, +// and measure the sum of numbers of links in the chain +// that have to be traversed because of collisions +#ifndef TSFLEXHASH_COUNT_COLLISION +# define TSFLEXHASH_COUNT_COLLISION 0 +#endif /* TSFLEXHASH_COUNT_COLLISION */ + +// the optional capacity of your hashtable. this must be a power of two. if you +// don't specify the capacity here, you must pass it as an argument to the +// create function +#ifndef TSFLEXHASH_CAPACITY +# define TSFLEXHASH_MASK(hashtable) ((hashtable)->mask) +#else +# define TSFLEXHASH_MASK(hashtable) ((TSFLEXHASH_CAPACITY) - 1) +#endif /* TSFLEXHASH_CAPACITY */ + +// set this to 1 if you want to use tsnuma_malloc() and tsnuma_free() +#if defined(TSFLEXHASH_NUMA) && TSFLEXHASH_NUMA +// tsnuma is broken, because according to Valgrind libnuma is broken +# if defined(JNI_TRUE) +# error "You cannot use tsnuma in JNI code. It's broken!!!!" 
+# endif /* JNI_TRUE */ +# include "tsnuma.h" +# define TSFLEXHASH_MALLOC tsnuma_malloc +# define TSFLEXHASH_FREE tsnuma_free + +#endif /* TSFLEXHASH_NUMA */ + +// set these if you want the hashtable to use custom malloc() and free() +// functions. note that the malloc function you specify *must* return zeroed-out +// memory, otherwise the hashtable will become corrupted +#ifndef TSFLEXHASH_MALLOC +# define TSFLEXHASH_MALLOC tsflexhash_malloc +#endif /* TSFLEXHASH_MALLOC */ + +#ifndef TSFLEXHASH_FREE +# define TSFLEXHASH_FREE tsflexhash_free +#endif /* TSFLEXHASH_FREE */ + + +// -------- no user serviceable parts below this line --------- // + + +#define TSFLEXHASH_NAME_LITERAL1(name) # name +#define TSFLEXHASH_NAME_LITERAL0(name) TSFLEXHASH_NAME_LITERAL1(name) +#define TSFLEXHASH_NAME_LITERAL TSFLEXHASH_NAME_LITERAL0(TSFLEXHASH_NAME) + +#define TSFLEXHASH_MANGLE1(name,foo) name ## _ ## foo +#define TSFLEXHASH_MANGLE0(name,foo) TSFLEXHASH_MANGLE1(name,foo) +#define TSFLEXHASH_MANGLE(foo) TSFLEXHASH_MANGLE0(TSFLEXHASH_NAME,foo) + +#define TSFLEXHASH_ENTRY_T TSFLEXHASH_MANGLE(entry_t) +#define TSFLEXHASH_T TSFLEXHASH_MANGLE(t) + +#if TSFLEXHASH_COPY_VAL +# define TSFLEXHASH_RETURN_VALUE(entry) entry->val.obj; +# define TSFLEXHASH_RETURN_TYPE TSFLEXHASH_VAL_TYPE +#else +# define TSFLEXHASH_RETURN_VALUE(entry) &entry->val.obj; +# define TSFLEXHASH_RETURN_TYPE TSFLEXHASH_VAL_TYPE * +#endif /* TSFLEXHASH_COPY_VAL */ + +#ifdef TSFLEXHASH_AUTO_REHASH +# ifdef TSFLEXHASH_CAPACITY +# error "You cannot set AUTO_REHASH and CAPACITY at the same time" +# endif /* TSFLEXHASH_CAPACITY */ +# ifndef TSFLEXHASH_REHASH +# define TSFLEXHASH_REHASH 1 +# endif /* TSFLEXHASH_REHASH */ +#endif /* TSFLEXHASH_AUTO_REHASH */ + +#ifdef TSFLEXHASH_KEY_HASH +# define TSFLEXHASH_ENTRY(hashtable,key) (&hashtable->entries[\ + ((uint32_t) TSFLEXHASH_KEY_HASH(key)) & \ + ((uint32_t) TSFLEXHASH_MASK(hashtable))]) +#else +# define TSFLEXHASH_ENTRY(hashtable,key) (&hashtable->entries[\ + ((uint32_t) 
TSFLEXHASH_KEY_HASH_EXT(hashtable, key)) & \ + ((uint32_t) TSFLEXHASH_MASK(hashtable))]) +#endif /* TSFLEXHASH_KEY_HASH */ + +#define TSFLEXHASH_ZERO_ENTRY(entry) \ + memset(&entry->val.obj, 0, sizeof(entry->val.obj)) + +typedef struct TSFLEXHASH_MANGLE(entry) TSFLEXHASH_MANGLE(entry_t); + +struct TSFLEXHASH_MANGLE(entry) { + TSFLEXHASH_ENTRY_T *next; + TSFLEXHASH_KEY_TYPE key; + + union { + TSFLEXHASH_VAL_TYPE obj; +#ifdef TSFLEXHASH_VAL_SIZE + // if the user wants more space for each object, this pads the entry to + // accomodate that + char bytes[TSFLEXHASH_VAL_SIZE]; +#endif /* TSFLEXHASH_VAL_SIZE */ + } val; + +#ifdef TSFLEXHASH_REHASH + // quotient is for rehash, when rehash doubles the size of capacity, + // quotient will be used to direct what is the new location for an entry. + uint32_t quotient; +#endif /* TSFLEXHASH_REHASH */ +}; + +// we use the lowest bit of entry->next as a boolean to indicate whether +// the entry is valid. +#define TSFLEXHASH_IS_VALID(entry) (((uintptr_t) (entry)->next) & 0x1) +#define TSFLEXHASH_NEXT(entry) (__REINTERPRET_CAST(TSFLEXHASH_ENTRY_T *, (((uintptr_t) (entry)->next) & \ + (~((uintptr_t) 0x1))))) +#define TSFLEXHASH_MARK_VALID(entry) ((entry)->next = \ + __REINTERPRET_CAST(TSFLEXHASH_ENTRY_T *, (__REINTERPRET_CAST(uintptr_t, (entry)->next) | 0x1))) +#define TSFLEXHASH_MARK_INVALID(entry) ((entry)->next = \ + __REINTERPRET_CAST(TSFLEXHASH_ENTRY_T *, (__REINTERPRET_CAST(uintptr_t, (entry)->next) & (~((uintptr_t) 0x1))))) + +typedef struct TSFLEXHASH_NAME { + uint32_t mask; + uint32_t size; + // pointer to the head of a linked-list of unused entry objects + TSFLEXHASH_ENTRY_T *pool; + // pointer values used by replenish_pool to malloc new entries and chain + // them together + const size_t entry_size, next_offset; + // for debugging purposes. a string literal of TSFLEXHASH_NAME + const char *name; + void *(*malloc_function)(size_t); + void (*free_function)(void *); + // head of a linked list of overflow entry blocks. 
used by destroy() + void *entry_blocks; + // an array of entries. create() allocates the correct amount of space for + // these (i.e. not 0) + uint32_t collision_count; + uint32_t capacity_bit_count; + uint64_t chain_link_traversal_count; +#ifdef TSFLEXHASH_REHASH + TSFLEXHASH_ENTRY_T *entries; +#else + TSFLEXHASH_ENTRY_T entries[0]; +#endif /* TSFLEXHASH_REHASH */ + +} +// this will break create() +//__attribute__((aligned(TSFLEXHASH_ENTRY_ALIGNMENT))) +TSFLEXHASH_T; + +typedef void* +(*(TSFLEXHASH_MANGLE(create_func)))(size_t, + size_t, + uint32_t, + const char *, + void *(*)(size_t), + void (*)(void *)); + +static inline __attribute__((__always_inline__)) uint32_t +TSFLEXHASH_MANGLE(capacity)(TSFLEXHASH_T *hashtable) +{ + return hashtable->mask + 1; +} + +static inline __attribute__((__always_inline__)) uint32_t +TSFLEXHASH_MANGLE(size)(TSFLEXHASH_T *hashtable) +{ + return hashtable->size; +} + +static inline __attribute__((__always_inline__)) double +TSFLEXHASH_MANGLE(load_factor)(TSFLEXHASH_T *hashtable) +{ + return ((double) hashtable->size) / (hashtable->mask + 1); +} + +static inline __attribute__((__always_inline__)) TSFLEXHASH_T * +#ifdef TSFLEXHASH_CAPACITY +TSFLEXHASH_MANGLE(create)(void) +#else +TSFLEXHASH_MANGLE(create)(uint32_t capacity) +#endif /* TSFLEXHASH_CAPACITY */ +{ + TSFLEXHASH_MANGLE(create_func) create_func = NULL; +#ifdef TSFLEXHASH_REHASH + create_func = tsflexhash_create_rehash; +#else + create_func = tsflexhash_create_norehash; +#endif /* TSFLEXHASH_REHASH */ + + return (TSFLEXHASH_T *) create_func(sizeof(TSFLEXHASH_ENTRY_T), + offsetof(TSFLEXHASH_ENTRY_T, next), +#ifdef TSFLEXHASH_CAPACITY + TSFLEXHASH_CAPACITY, +#else + capacity, +#endif /* TSFLEXHASH_CAPACITY */ + TSFLEXHASH_NAME_LITERAL, + TSFLEXHASH_MALLOC, + TSFLEXHASH_FREE + ); +} + +static inline __attribute__((__always_inline__)) void +TSFLEXHASH_MANGLE(destroy)(TSFLEXHASH_T *hashtable) +{ +#ifdef TSFLEXHASH_REHASH + tsflexhash_destroy_rehash(hashtable); +#else + 
tsflexhash_destroy_norehash(hashtable); +#endif /* TSFLEXHASH_REHASH */ +} + +static inline __attribute__((__always_inline__)) TSFLEXHASH_RETURN_TYPE +TSFLEXHASH_MANGLE(get)(TSFLEXHASH_T *hashtable, + TSFLEXHASH_GET_KEY_QUALIFIER TSFLEXHASH_KEY_TYPE key) +{ + // index into the entry array + TSFLEXHASH_ENTRY_T *entry = TSFLEXHASH_ENTRY(hashtable, key); + for (;;) { + if (__predict_true(TSFLEXHASH_IS_VALID(entry) && + TSFLEXHASH_KEY_EQUALS(entry->key, key))) + { + // keys match. return this entry's value + return TSFLEXHASH_RETURN_VALUE(entry); + } + // keys don't match. there was a collision. go to the next entry in the + // chain +#ifdef TSFLEXHASH_COUNT_COLLISION + hashtable->chain_link_traversal_count++; +#endif /* TSFLEXHASH_COUNT_COLLISION */ + entry = TSFLEXHASH_NEXT(entry); + if (__predict_true(entry == NULL)) { + // no more entries. the key does not exist in the hashtable + return TSFLEXHASH_EMPTY_RETURN_VAL; + } + } +} + +#ifdef TSFLEXHASH_REHASH +# define MAX_HASH_CAPACITY (1<<30) +static inline __attribute__((__always_inline__)) +void +TSFLEXHASH_MANGLE(rehash)(TSFLEXHASH_T *hashtable) +{ + uint32_t capacity = hashtable->mask + 1; + uint32_t new_capacity = capacity << 1; + if (new_capacity > MAX_HASH_CAPACITY || new_capacity == 0) { + return; + } + + TSFLEXHASH_ENTRY_T *entry_table = __STATIC_CAST(TSFLEXHASH_ENTRY_T *, hashtable->malloc_function(new_capacity * hashtable->entry_size)); + if (entry_table == NULL) { + return; + } +# if TSFLEXHASH_COUNT_COLLISION + hashtable->chain_link_traversal_count = 0; + hashtable->collision_count = 0; +# endif /* TSFLEXHASH_COUNT_COLLISION */ + + for (uint32_t i = 0; i < capacity; i++) { + TSFLEXHASH_ENTRY_T *entry = &hashtable->entries[i]; + if (TSFLEXHASH_IS_VALID(entry) == 0) { + continue; + } + + while (entry != NULL) { + TSFLEXHASH_ENTRY_T *next_entry = TSFLEXHASH_NEXT(entry); + TSFLEXHASH_ENTRY_T *new_entry = &entry_table[i]; + if ((entry->quotient & 0x01) != 0) { + new_entry = &entry_table[i + capacity]; + } + + 
entry->quotient >>= 1; + + if (TSFLEXHASH_IS_VALID(new_entry) == 0) { + new_entry->key = entry->key; + new_entry->quotient = entry->quotient; +# ifdef TSFLEXHASH_VAL_SIZE + memcpy(&new_entry->val, &entry->val, sizeof(entry->val)); +# else + new_entry->val = entry->val; +# endif /* TSFLEXHASH_VAL_SIZE */ + } else { + TSFLEXHASH_ENTRY_T *next_new_entry = TSFLEXHASH_NEXT(new_entry); + while (next_new_entry != NULL) { + new_entry = next_new_entry; + next_new_entry = TSFLEXHASH_NEXT(new_entry); + } + new_entry->next = entry; + entry->next = NULL; + TSFLEXHASH_MARK_VALID(entry); +# if TSFLEXHASH_COUNT_COLLISION + hashtable->collision_count++; +# endif /* TSFLEXHASH_COUNT_COLLISION */ + } + TSFLEXHASH_MARK_VALID(new_entry); + entry = next_entry; + } + } + + hashtable->free_function(hashtable->entries); + hashtable->entries = entry_table; + hashtable->mask = new_capacity - 1; + hashtable->capacity_bit_count++; +} +#endif /* TSFLEXHASH_REHASH */ + +static inline __attribute__((__always_inline__)) +#if TSFLEXHASH_COPY_VAL +void +TSFLEXHASH_MANGLE(insert)(TSFLEXHASH_T *hashtable, TSFLEXHASH_KEY_TYPE key, + TSFLEXHASH_VAL_TYPE val) +#else +TSFLEXHASH_RETURN_TYPE +TSFLEXHASH_MANGLE(insert)(TSFLEXHASH_T *hashtable, TSFLEXHASH_KEY_TYPE key) +#endif /* TSFLEXHASH_COPY_VAL */ +{ + +#ifdef TSFLEXHASH_AUTO_REHASH + // Check if 112% full for the current table. + uint32_t hash_capacity = hashtable->mask + 1; + if (hashtable->size > (hash_capacity >> 3) + hash_capacity) { + TSFLEXHASH_MANGLE(rehash)(hashtable); + } +#endif /* TSFLEXHASH_AUTO_REHASH */ + + // compute the hash function. 
+#ifdef TSFLEXHASH_KEY_HASH + uint32_t key_hash = ((uint32_t)TSFLEXHASH_KEY_HASH(key)); +#else // TSFLEXHASH_KEY_HASH_EXT + uint32_t key_hash = ((uint32_t)TSFLEXHASH_KEY_HASH_EXT(hashtable, key)); +#endif /* TSFLEXHASH_KEY_HASH */ + + // index into the array + TSFLEXHASH_ENTRY_T *entry = &hashtable->entries[key_hash & ((uint32_t)TSFLEXHASH_MASK(hashtable))]; + if (__predict_false(TSFLEXHASH_IS_VALID(entry))) { + // the entry contains a value (collision). so grab a free entry from the + // pool and insert it into the chain (at the head, since that's faster) + TSFLEXHASH_ENTRY_T *new_entry = hashtable->pool; + if (__predict_false(new_entry == TSFLEXHASH_SENTINEL)) { + // oops, the pool was empty. malloc a whole bunch of new entries. + // this is slow. +#ifdef TSFLEXHASH_REHASH + new_entry = ((TSFLEXHASH_ENTRY_T *) tsflexhash_replenish_pool_rehash( + hashtable, false)); +#else + new_entry = ((TSFLEXHASH_ENTRY_T *) tsflexhash_replenish_pool_norehash( + hashtable, false)); +#endif /* TSFLEXHASH_REHASH */ + } + if (new_entry->next == NULL) { + hashtable->pool = new_entry + 1; + } else { + hashtable->pool = new_entry->next; + } + new_entry->next = entry->next; + entry->next = new_entry; + TSFLEXHASH_MARK_VALID(entry); + entry = new_entry; +#if TSFLEXHASH_COUNT_COLLISION + hashtable->collision_count++; +#endif /* TSFLEXHASH_COUNT_COLLISION */ + } // else { (key == 0) means the entry is empty } + entry->key = key; +#ifdef TSFLEXHASH_REHASH + uint32_t quotient = (key_hash >> (hashtable->capacity_bit_count)); + entry->quotient = quotient; +#endif /* TSFLEXHASH_REHASH */ + TSFLEXHASH_MARK_VALID(entry); +#if TSFLEXHASH_UPDATE_SIZE + hashtable->size++; +#endif /* TSFLEXHASH_UPDATE_SIZE */ +#if TSFLEXHASH_COPY_VAL + entry->val.obj = val; +#else + return TSFLEXHASH_RETURN_VALUE(entry); +#endif /* TSFLEXHASH_COPY_VAL */ +} + +static inline __attribute__((__always_inline__)) +#if TSFLEXHASH_USE_ERASE + void +TSFLEXHASH_MANGLE(erase)(TSFLEXHASH_T *hashtable, +#else + 
TSFLEXHASH_RETURN_TYPE +TSFLEXHASH_MANGLE(remove)(TSFLEXHASH_T *hashtable, +#endif /* TSFLEXHASH_USE_ERASE */ + TSFLEXHASH_GET_KEY_QUALIFIER TSFLEXHASH_KEY_TYPE key +#if TSFLEXHASH_REMOVE_RETURNS_KEY +, TSFLEXHASH_KEY_TYPE *pkey +#endif /* TSFLEXHASH_REMOVE_RETURNS_KEY */ +) +{ + TSFLEXHASH_ENTRY_T *entry, *alt; + // index into the array + entry = TSFLEXHASH_ENTRY(hashtable, key); + if (__predict_true(TSFLEXHASH_IS_VALID(entry) && + TSFLEXHASH_KEY_EQUALS(entry->key, key))) + { + // keys matched. return this entry +#if TSFLEXHASH_UPDATE_SIZE + hashtable->size--; +#endif /* TSFLEXHASH_UPDATE_SIZE */ +#if TSFLEXHASH_REMOVE_RETURNS_KEY + if (pkey != NULL) { + *pkey = entry->key; + } +#endif /* TSFLEXHASH_REMOVE_RETURNS_KEY */ +#if TSFLEXHASH_COPY_VAL + alt = TSFLEXHASH_NEXT(entry); + if (__predict_true(alt == NULL)) { + TSFLEXHASH_MARK_INVALID(entry); +# if TSFLEXHASH_USE_ERASE + TSFLEXHASH_ZERO_ENTRY(entry); + return; +# else + return TSFLEXHASH_RETURN_VALUE(entry); +# endif /* TSFLEXHASH_USE_ERASE */ + } +# if !TSFLEXHASH_USE_ERASE + TSFLEXHASH_RETURN_TYPE ret = TSFLEXHASH_RETURN_VALUE(entry); +# endif /* TSFLEXHASH_USE_ERASE */ + entry->next = alt->next; + entry->key = alt->key; + entry->val.obj = alt->val.obj; + alt->next = hashtable->pool; + hashtable->pool = alt; +# if TSFLEXHASH_USE_ERASE + return; +# else + return ret; +# endif /* TSFLEXHASH_USE_ERASE */ +#else + TSFLEXHASH_MARK_INVALID(entry); +# if TSFLEXHASH_USE_ERASE + TSFLEXHASH_ZERO_ENTRY(entry); + return; +# else + return TSFLEXHASH_RETURN_VALUE(entry); +# endif /* TSFLEXHASH_USE_ERASE */ +#endif /* TSFLEXHASH_COPY_VAL */ + } + // keys didn't match (collision). follow the chain, just as with get() + for (;;) { + alt = entry; + entry = TSFLEXHASH_NEXT(entry); + if (__predict_false(entry == NULL)) { +#if TSFLEXHASH_USE_ERASE + return; +#else + return TSFLEXHASH_EMPTY_RETURN_VAL; +#endif /* TSFLEXHASH_USE_ERASE */ + } + if (__predict_true(TSFLEXHASH_KEY_EQUALS(entry->key, key))) { + if (! 
TSFLEXHASH_IS_VALID(alt)) { + TSFLEXHASH_MARK_INVALID(entry); + } + alt->next = entry->next; + entry->next = hashtable->pool; + hashtable->pool = entry; +#if TSFLEXHASH_UPDATE_SIZE + hashtable->size--; +#endif /* TSFLEXHASH_UPDATE_SIZE */ +#if TSFLEXHASH_REMOVE_RETURNS_KEY + if (pkey != NULL) { + *pkey = entry->key; + } +#endif /* TSFLEXHASH_REMOVE_RETURNS_KEY */ +#if TSFLEXHASH_USE_ERASE + TSFLEXHASH_ZERO_ENTRY(entry); + return; +#else + return TSFLEXHASH_RETURN_VALUE(entry); +#endif /* TSFLEXHASH_USE_ERASE */ + } + } +} + +static inline __attribute__((__always_inline__)) void +TSFLEXHASH_MANGLE(clear)(TSFLEXHASH_T *hashtable) +{ + TSFLEXHASH_ENTRY_T *entry, *a, *b; + const uint32_t mask = TSFLEXHASH_MASK(hashtable); + // pathscale barfs on a normal for() loop. but this seems to work... + uint32_t idx = 0; + while (idx <= mask) { + entry = &hashtable->entries[idx]; + a = TSFLEXHASH_NEXT(entry); + while (a != NULL) { + b = TSFLEXHASH_NEXT(a); + a->next = hashtable->pool; + hashtable->pool = a; + a = b; + } + entry->next = NULL; + idx++; + } +#if TSFLEXHASH_COUNT_COLLISION + hashtable->collision_count = 0; + hashtable->chain_link_traversal_count = 0; +#endif /* TSFLEXHASH_COUNT_COLLISION */ +#if TSFLEXHASH_UPDATE_SIZE + hashtable->size = 0; +#endif /* TSFLEXHASH_UPDATE_SIZE */ +} + +#define TSFLEXHASH_ITERATOR_T TSFLEXHASH_MANGLE(iterator_t) + +typedef struct { + TSFLEXHASH_ENTRY_T *entry; + uint32_t idx; + TSFLEXHASH_ENTRY_T dummy_entry; +} TSFLEXHASH_ITERATOR_T; + +static inline __attribute__((__always_inline__)) void +TSFLEXHASH_MANGLE(iterator_init)(TSFLEXHASH_T *hashtable, + TSFLEXHASH_ITERATOR_T *i) +{ + __USE(hashtable); + i->entry = &i->dummy_entry; + i->idx = -1; + i->dummy_entry.next = NULL; +} + +static inline __attribute__((__always_inline__)) TSFLEXHASH_RETURN_TYPE +TSFLEXHASH_MANGLE(iterator_next)(TSFLEXHASH_T *hashtable, + TSFLEXHASH_ITERATOR_T *i, TSFLEXHASH_KEY_TYPE *key) +{ + TSFLEXHASH_ENTRY_T *entry = i->entry; + for (;;) { + /* 
coverity[deref_ptr] */ + entry = TSFLEXHASH_NEXT(entry); + /* coverity[check_after_deref] */ + if (entry != NULL) { + break; + } + if (__predict_false(i->idx == TSFLEXHASH_MASK(hashtable))) { + return TSFLEXHASH_EMPTY_RETURN_VAL; + } + (i->idx)++; + // pathscale has some weird bug that causes entry to get set to a + // crazy value without this little disruption. +#ifdef __PATHSCALE__ + compiler_fence(); +#endif /* __PATHSCALE__ */ + entry = &hashtable->entries[i->idx]; + /* coverity[deref_ptr] */ + if (TSFLEXHASH_IS_VALID(entry)) { + break; + } + } + i->entry = entry; + if (key != NULL) { + *key = entry->key; + } + return TSFLEXHASH_RETURN_VALUE(entry); +} + +#undef TSFLEXHASH_NAME +#undef TSFLEXHASH_STRING_KEYS +#undef TSFLEXHASH_CONST_STRING_KEYS +#undef TSFLEXHASH_KEY_CONST +#undef TSFLEXHASH_GET_KEY_QUALIFIER +#undef TSFLEXHASH_KEY_TYPE +#undef TSFLEXHASH_KEY_EQUALS +#undef TSFLEXHASH_KEY_HASH +#undef TSFLEXHASH_KEY_HASH_EXT +#undef TSFLEXHASH_REMOVE_RETURNS_KEY +#undef TSFLEXHASH_USE_ERASE +#undef TSFLEXHASH_VAL_TYPE +#undef TSFLEXHASH_VAL_SIZE +#undef TSFLEXHASH_CAPACITY +#undef TSFLEXHASH_MASK +#undef TSFLEXHASH_COPY_VAL +#undef TSFLEXHASH_EMPTY_RETURN_VAL +#undef TSFLEXHASH_COUNT_COLLISION +#undef TSFLEXHASH_UPDATE_SIZE +#undef TSFLEXHASH_MALLOC +#undef TSFLEXHASH_FREE +#undef TSFLEXHASH_NUMA + +#undef TSFLEXHASH_NAME_LITERAL1 +#undef TSFLEXHASH_NAME_LITERAL0 +#undef TSFLEXHASH_NAME_LITERAL + +#undef TSFLEXHASH_MANGLE1 +#undef TSFLEXHASH_MANGLE0 +#undef TSFLEXHASH_MANGLE + +#undef TSFLEXHASH_ENTRY +#undef TSFLEXHASH_ENTRY_T +#undef TSFLEXHASH_T + +#undef TSFLEXHASH_RETURN_VALUE +#undef TSFLEXHASH_RETURN_TYPE + +#undef TSFLEXHASH_NEXT +#undef TSFLEXHASH_MARK_VALID +#undef TSFLEXHASH_MARK_INVALID +#undef TSFLEXHASH_ITERATOR_T + +#undef TSFLEXHASH_AUTO_REHASH + +__END_DECLS diff --git a/src/tsflexhash_private.h b/src/tsflexhash_private.h new file mode 100644 index 0000000..e59d055 --- /dev/null +++ b/src/tsflexhash_private.h @@ -0,0 +1,50 @@ +/* + * Copyright 
2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Private functions used by tsflexhash.h. Users of the tsflexhash library + * should not call these functions directly. + */ +#ifndef _TSFLEXHASH_PRIVATE_H_ +#define _TSFLEXHASH_PRIVATE_H_ + +#include +#include + +__BEGIN_DECLS + +/* + * All-ones pointer value. tsflexhash.h compares the head of the free-entry + * pool against it and calls tsflexhash_replenish_pool_*() on a match. + */ +#define TSFLEXHASH_SENTINEL ((void *) ~ ((uintptr_t) 0)) + +void *tsflexhash_create_norehash(size_t, size_t, uint32_t, const char *, + void *(*)(size_t), void (*)(void *)); + +void *tsflexhash_create_rehash(size_t, size_t, uint32_t, const char *, + void *(*)(size_t), void (*)(void *)); + +void tsflexhash_destroy_norehash(void *); + +void tsflexhash_destroy_rehash(void *); + +void *tsflexhash_replenish_pool_norehash(void *, bool); + +void *tsflexhash_replenish_pool_rehash(void *, bool); + +void *tsflexhash_malloc(size_t); + +void tsflexhash_free(void *); + +__END_DECLS + +#endif diff --git a/src/tsgosmacs.h b/src/tsgosmacs.h new file mode 100644 index 0000000..5367b17 --- /dev/null +++ b/src/tsgosmacs.h @@ -0,0 +1,45 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _TS_GOSMACS_H +#define _TS_GOSMACS_H + +__BEGIN_DECLS + +/* + * Gosling's Emacs algorithm + * Leaves STL one in the dust in speed and quality. + */ +static inline __attribute__((__always_inline__)) +size_t gosmacs_hash(const char *s) +{ + size_t h = 0; + while (*s) + h = (h << 5) - h + (unsigned char) *s++; + return h; +} +static inline __attribute__((__always_inline__)) +size_t gosmacsn_hash(const char *s, size_t n) +{ + size_t h = 0; + for (size_t i = 0; i < n; ++i) + h = (h << 5) - h + s[i]; + return h; +} + +__END_DECLS + +#endif diff --git a/src/tslock.h b/src/tslock.h new file mode 100644 index 0000000..5c951d9 --- /dev/null +++ b/src/tslock.h @@ -0,0 +1,387 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _TSLOCK_H_ +#define _TSLOCK_H_ + +/** \file + * Ticket based spin locks for C threads. + * + * Implements a fast, fair spinlock for threads using the + * ticket based algorithm. 
+ * + * Usually, you should just use tslock_alloc() and tslock_destroy() + * to create and destroy lock objects, since those functions guarantee + * that the cache lines are optimized. + * + * If you really want to allocate a lock in BSS (global variable), use + * the TSLOCK_DECLARE_STATIC macro. + * + * If you really want to embed a lock in your own struct, use the + * extremely_dangerous_internal_tslock_s. If you don't understand how, + * don't do it. Inlining WILL NOT improve performance on any machines + * in use at Two Sigma, since our machines have branch and data predictors + * that make the extra pointer indirection free. + */ +#include +#include +#include +#include +#if __GNUC__ >= 5 +#include +#include +#include + +#if ATOMIC_INT_LOCK_FREE != 2 +#error your architecture does not support lock-free ints +#endif + +#else // __GNUC__ +#include "atomic.h" +#endif // __GNUC__ + +#include +#include "tslog.h" + +//when this is undefined, tslock_t will become opaque, safer, and faster +#define TSLOCK_COMPAT 1 + +//If you redefine this, it really needs to be a power of 2. 
+//If it's not a power of two, BAD THINGS WILL HAPPEN +#define TSLOCK_ALIGN_SIZE 64 + +#ifdef TSLOCK_COMPAT +//minimum alignment for correctness +#define TSLOCK_ALIGN_CHECK_SIZE 8 +#else +#define TSLOCK_ALIGN_CHECK_SIZE TSLOCK_ALIGN_SIZE +#endif + +struct extremely_dangerous_internal_tslock_s +{ +#if __GNUC__ >= 5 + alignas(TSLOCK_ALIGN_SIZE) atomic_uint next_in_line; + alignas(TSLOCK_ALIGN_SIZE) atomic_uint now_serving; +#else + volatile uint32_t next_in_line __attribute__((aligned(TSLOCK_ALIGN_SIZE))); + volatile uint32_t now_serving __attribute__((aligned(TSLOCK_ALIGN_SIZE))); +#endif + pthread_t holder; +}; + +static_assert(sizeof(struct extremely_dangerous_internal_tslock_s) >= 128, "Ensure alignment attributes worked"); + +#ifndef TSLOCK_COMPAT +typedef struct _donteventhinkaboutit tslock_t; +#else +typedef struct extremely_dangerous_internal_tslock_s tslock_t; +#endif + +#define TSLOCK_DECLARE_STATIC(name) \ + static struct extremely_dangerous_internal_tslock_s __attribute__((aligned(TSLOCK_ALIGN_SIZE))) name##_underlying; \ + static tslock_t * const name = __STATIC_CAST(tslock_t *, __STATIC_CAST(uintptr_t, &name##_underlying)); + +/* + * The lock returns by tslock_alloc can be freed with tslock_destroy() + */ +static tslock_t * __attribute__((used)) +tslock_alloc(void) +{ + void * lock = NULL; + + const size_t lock_sz = + sizeof(struct extremely_dangerous_internal_tslock_s); + + int rc = posix_memalign( + &lock, + TSLOCK_ALIGN_SIZE, + lock_sz + ); + if (rc != 0) { + return NULL; + } + +#if __GNUC__ >= 5 + struct extremely_dangerous_internal_tslock_s* lock_internal = __STATIC_CAST(struct extremely_dangerous_internal_tslock_s*, lock); + atomic_init(&lock_internal->next_in_line, 0); + atomic_init(&lock_internal->now_serving, 0); + lock_internal->holder = 0; +#else + memset(lock, 0, lock_sz); +#endif + + return __STATIC_CAST(tslock_t *, lock); +} + +static int tslock_complained_loudly; + +static inline void +tslock_alignment_check( + void * lock, + const char * 
const file, + const int line +) +{ + if (unlikely(((uintptr_t) lock & (TSLOCK_ALIGN_CHECK_SIZE-1)))) { + TSABORTX( + "%s:%d: tslock_t is not properly aligned. " + "Please use tslock_alloc() or alignas(TSLOCK_ALIGN_SIZE) or " + "else your performance could greatly suffer. " + "Also, the way this lock is currently being used doesn't " + "guarantee that it actually behaves like a lock.", + file, line); + } + +#ifdef TSLOCK_COMPAT + if (!tslock_complained_loudly && unlikely(((uintptr_t) lock & (TSLOCK_ALIGN_SIZE-1)))) { + TSLOGX(TSERROR, + "%s:%d: You are not using the new tslock_t api. " + "Please use tslock_alloc() or alignas(TSLOCK_ALIGN_SIZE) or " + "else your performance could greatly suffer. " + "The old api will be deprecated soon.", + file, line); + tslock_complained_loudly = 1; + } +#endif +} + + +static inline void +_tslock( + tslock_t * lock_ptr, + const char * const file, + const int line +) +{ + struct extremely_dangerous_internal_tslock_s * lock = + __PUNNED_CAST(struct extremely_dangerous_internal_tslock_s *, lock_ptr); + + tslock_alignment_check(lock, file, line); + + const pthread_t self = pthread_self(); + +#if __GNUC__ >= 5 + const uint32_t ticket = atomic_fetch_add(&lock->next_in_line, 1); +#else + const uint32_t ticket = atomic_inc_32_nv(&lock->next_in_line) - 1; +#endif + // Fast path for uncontested, unlocked case +#if __GNUC__ >= 5 + if (atomic_load(&lock->now_serving) == ticket) +#else + if (lock->now_serving == ticket) +#endif + { + lock->holder = self; + return; + } + + // Check to see if the lock holder is self, in which case deadlock + // is imminent. 
+ if (lock->holder == self) + TSLOGX(TSWARN, "%s:%d: Possible deadlock: lock is held by self", + file, line); + + // We are locked, spin until we are ready + uint64_t spin_count = 0; +#if __GNUC__ >= 5 + while (atomic_load(&lock->now_serving) != ticket) +#else + while (lock->now_serving != ticket) +#endif + { +#ifdef __PATHSCALE__ + // psc generates an infinite loop for this for some reason + compiler_fence(); +#else + _mm_pause(); +#endif + if ((++spin_count & 0xFFFFFFF) == 0) + TSLOGX(TSWARN, + "%s:%d: Possible deadlock" + " spin_count=%"PRIu64 + " ticket=%"PRIx32 + " next_ticket=%"PRIx32 + " now_serving=%"PRIx32, + file, + line, + spin_count, + ticket, +#if __GNUC__ >= 5 + atomic_load(&lock->next_in_line), + atomic_load(&lock->now_serving) +#else + lock->next_in_line, + lock->now_serving +#endif + ); + } + + lock->holder = self; +} + +#define tslock(lock) _tslock(lock, __FILE__, __LINE__) + + +static inline void +_tsunlock( + tslock_t * lock_ptr, + const char * const file, + const int line +) +{ + struct extremely_dangerous_internal_tslock_s * lock = + __PUNNED_CAST(struct extremely_dangerous_internal_tslock_s *, lock_ptr); + + if (unlikely(!lock->holder)) + { + // This will not detect all double unlocks, but hopefully + // some of them will be found and fixed through this warning + TSLOGX(TSERROR, "%s:%d: Double unlock detected", file, line); + return; + } + + lock->holder = 0; +#if __GNUC__ >= 5 + atomic_fetch_add(&lock->now_serving, 1); +#else + lock->now_serving++; +#endif +} + +#define tsunlock(lock) \ + _tsunlock((lock), __FILE__, __LINE__) + +#define tstrylock(lock) \ + _tstrylock((lock), __FILE__, __LINE__) + +static inline int +_tstrylock( + tslock_t * lock_ptr, + const char * const file, + const int line +) +{ + struct extremely_dangerous_internal_tslock_s * lock = + __PUNNED_CAST(struct extremely_dangerous_internal_tslock_s *, lock_ptr); + + tslock_alignment_check(lock, file, line); + +#if __GNUC__ >= 5 + const uint32_t ticket = 
atomic_load(&lock->next_in_line); + + // If it is already locked, we can't win + if (atomic_load(&lock->now_serving) != ticket) + return 0; + + // If it hasn't changed, then we might be able to get it + // if the same ticket is still there + uint32_t existing_ticket = ticket; + if (atomic_compare_exchange_strong(&lock->next_in_line, &existing_ticket, ticket+1) == false) + return 0; +#else + const uint32_t ticket = lock->next_in_line; + + // If it is already locked, we can't win + if (lock->now_serving != ticket) + return 0; + + // If it hasn't changed, then we might be able to get it + // if the same ticket is still there + if (atomic_cas_32(&lock->next_in_line, ticket, ticket+1) != ticket) + return 0; +#endif + + // We have it. Record our ownership + lock->holder = pthread_self(); + return 1; +} + + +/* Check if the lock is currently held. + * + * The values must be read into locals to avoid undefined behaviour + * as to which volatile will be read first. + */ +static inline int +tsislocked( + tslock_t * lock_ptr +) +{ + struct extremely_dangerous_internal_tslock_s * lock = + __PUNNED_CAST(struct extremely_dangerous_internal_tslock_s *, lock_ptr); + +#if __GNUC__ >= 5 + const uint32_t now_serving = atomic_load(&lock->now_serving); + const uint32_t next_in_line = atomic_load(&lock->next_in_line); +#else + const uint32_t now_serving = lock->now_serving; + const uint32_t next_in_line = lock->next_in_line; +#endif + + return now_serving != next_in_line; +} + + + +/* Check to see if any other threads are waiting on the lock. + * + * Must be called with the lock held, otherwise the now_serving + * value might change unexpectedly. The values must be read into + * locals to avoid undefined behaviour as to which volatile will + * be read first. 
+ */ +static inline int +tslock_contended( + const tslock_t * const lock_ptr +) +{ + struct extremely_dangerous_internal_tslock_s * lock = + __PUNNED_CAST(struct extremely_dangerous_internal_tslock_s *, lock_ptr); + +#if __GNUC__ >= 5 + const uint32_t now_serving = atomic_load(&lock->now_serving); + const uint32_t next_in_line = atomic_load(&lock->next_in_line); +#else + const uint32_t now_serving = lock->now_serving; + const uint32_t next_in_line = lock->next_in_line; +#endif + + return now_serving + 1 != next_in_line; +} + +static void __attribute__((used)) tslock_destroy( + tslock_t * lock_ptr +) +{ + if (!lock_ptr) + return; + + struct extremely_dangerous_internal_tslock_s * lock = + __PUNNED_CAST(struct extremely_dangerous_internal_tslock_s *, lock_ptr); + + int okay = !tsislocked(lock_ptr) || lock->holder == pthread_self(); + + if (!okay) { + TSLOGX(TSERROR, + "You freed a lock while it was held by someone else. " + "Expect breakage."); + } + + free(lock_ptr); +} + +#endif diff --git a/src/tslog.c b/src/tslog.c new file mode 100644 index 0000000..2190d95 --- /dev/null +++ b/src/tslog.c @@ -0,0 +1,592 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "twosigma.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "tsdir.h"
+#include "tslog.h"
+
+// Descriptor and stream of the current log destination.  tslogopen() sets
+// both from the same file; until then logfd is stderr and logfp is NULL
+// (tslogfp() and tslogclose() replace the NULL with stderr).
+static int logfd = STDERR_FILENO;
+static FILE *logfp = NULL;
+
+// "YYYYMMDD-HHMMSS" is 15 characters, plus one for the terminating NUL
+#define TIMESTAMP_SIZE 16
+
+int tsdebug = 0;
+int tsabort = TSFATAL;
+int tslevel = TSINFO;
+
+// Write-lock request used by tslogopen() to claim the log file
+static const struct flock lock = {
+    .l_type = F_WRLCK,
+    .l_whence = SEEK_SET,
+    .l_start = 0,
+    .l_len = 0,
+    .l_pid = 0,
+};
+
+
+// One registered log hook: the callback and the cookie handed back to it
+typedef struct
+{
+    tslog_hookup_t handler;
+    void * priv;
+} tslog_hookup_entry_t;
+
+static tslog_hookup_entry_t * hookup_entries;
+static size_t hookup_count;
+
+
+// write() has to be locked because calling it from multiple threads on same fp
+// will result in lost/corrupt log entries.
+static pthread_mutex_t write_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+// Format the current time into buf with strftime(); local selects local time
+// instead of UTC.  Returns strftime()'s result, i.e. 0 when the output did
+// not fit (buf is then unusable).
+static size_t
+tslog_strftime(char *buf, size_t len, const char *fmt, bool local)
+{
+    struct tm tm;
+    time_t t = time(NULL);
+    if (local)
+        (void)localtime_r(&t, &tm);
+    else
+        (void)gmtime_r(&t, &tm);
+    return strftime(buf, len, fmt, &tm);
+}
+
+
+// Open fname (expanded through strftime() with the current UTC time) as the
+// log file, take an exclusive lock on it and make it the log destination.
+// flag is OR-ed into the open() flags.  Returns the new descriptor, or -1
+// after reporting the problem with warn()/warnx().
+int
+tslogopen(const char *fname, int flag)
+{
+    char buf[MAXPATHLEN];
+
+    // strftime() leaves buf indeterminate when it returns 0, so the name
+    // must not be used in that case.
+    if (tslog_strftime(buf, sizeof(buf), fname, false) == 0) {
+        warnx("Cannot expand log file name `%s'", fname);
+        return -1;
+    }
+    int fd = open(buf, O_CREAT|flag|O_RDWR, 0664);
+    if (fd == -1) {
+        warn("Cannot open `%s'", buf);
+        return -1;
+    }
+    if (fcntl(fd, F_SETLK, &lock) == -1) {
+        struct flock l;
+        int oerrno = errno;
+        l = lock;
+        if (fcntl(fd, F_GETLK, &l) == -1)
+            warn("Can't get lock information");
+        errno = oerrno;
+        warn("Cannot obtain exclusive lock on `%s' (locked by %d)", fname,
+            (int) l.l_pid);
+        (void)close(fd);
+        return -1;
+    }
+    FILE *fp;
+    if ((fp = fdopen(fd, "w+")) == NULL) {
+        warn("Cannot fdopen `%d'", fd);
+        (void)close(fd);
+        return -1;
+    }
+    logfp = fp;
+    return logfd = fd;
+}
+
+// Close the current log file, if any, and fall back to stderr.
+// Always returns 0.
+int
+tslogclose(void)
+{
+    int fd = logfd;
+    FILE *fp = logfp;
+
+    logfd = STDERR_FILENO;
+    logfp = stderr;
+
+    if (fp != NULL && fp != stderr) {
+        // fp was fdopen()ed on fd by tslogopen(), so fclose() releases the
+        // descriptor as well; closing fd again could hit an unrelated file.
+        (void)fclose(fp);
+    } else if (fd != STDERR_FILENO) {
+        (void)close(fd);
+    }
+
+    return 0;
+}
+
+// Return the stream of the current log destination (stderr by default)
+FILE *
+tslogfp(void)
+{
+    if (logfp == NULL)
+        logfp = stderr;
+    return logfp;
+}
+
+
+// Account for a (v)snprintf() call that was handed buf + offset with
+// size - offset bytes of room and returned rc.  On a formatting error the
+// fallback text is stored instead (cut to fit); on truncation the offset
+// stops one byte short of the end.  Requires offset < size; the returned
+// offset is also < size, so one byte always remains for the newline.
+static size_t
+tslog_advance(char *buf, size_t size, size_t offset, int rc,
+    const char *fallback)
+{
+    const size_t room = size - offset - 1; // characters that fit before NUL
+
+    if (rc < 0) {
+        size_t n = strlen(fallback);
+        if (n > room)
+            n = room;
+        memcpy(buf + offset, fallback, n);
+        return offset + n;
+    }
+    if ((size_t) rc > room)
+        return size - 1;
+    return offset + (size_t) rc;
+}
+
+
+// printf-style front end of tslog_info_vargs()
+void
+tslog_info(
+    const tslog_info_t * info,
+    const char *fmt,
+    ...
+)
+{
+    va_list ap;
+    va_start(ap, fmt);
+    tslog_info_vargs(info, fmt, ap);
+    va_end(ap);
+}
+
+
+// Format one log record (prefix, message, optional strerror text, newline),
+// write it to the log descriptor and pass the message part to every
+// registered hook.  errno is preserved across the call.
+void
+tslog_info_vargs(
+    const tslog_info_t * info,
+    const char * fmt,
+    va_list ap
+)
+{
+    const int serrno = errno;
+
+    char buf[10240];
+    size_t offset = 0;
+
+    // In backwards compatibility mode, the line# will be 0
+    // and the file will contain the fully formatted prefix string.
+    const char * logstr;
+    if (info->line == 0)
+        logstr = info->file;
+    else
+        logstr = getnewlogstr(
+            info->file,
+            info->line,
+            info->func,
+            info->level
+        );
+
+    // The legacy entry points accept a NULL prefix meaning "none"
+    if (logstr == NULL)
+        logstr = "";
+
+    // Clamp the prefix so that an oversized one cannot overrun buf and the
+    // last byte stays free for the newline.
+    const size_t logstr_len = strlen(logstr);
+    const size_t hdr_len =
+        logstr_len < sizeof(buf) - 1 ? logstr_len : sizeof(buf) - 1;
+    memcpy(buf, logstr, hdr_len);
+    offset += hdr_len;
+
+    int msg_len = vsnprintf(
+        buf + offset,
+        sizeof(buf) - offset,
+        fmt,
+        ap
+    );
+    offset = tslog_advance(buf, sizeof(buf), offset, msg_len,
+        "tslog vsnprintf failed!");
+
+    if (info->do_perror)
+    {
+        if (offset < sizeof(buf) - 1)
+            buf[offset++] = ':';
+
+        int perror_len = snprintf(
+            buf + offset,
+            sizeof(buf) - offset,
+            " (%s)",
+            strerror(serrno)
+        );
+        offset = tslog_advance(buf, sizeof(buf), offset, perror_len,
+            "strerr failed!");
+    }
+
+    // tslog_advance() always leaves room for the newline
+    buf[offset++] = '\n';
+
+    pthread_mutex_lock(&write_mutex);
+    ssize_t wlen = write(logfd, buf, offset);
+    pthread_mutex_unlock(&write_mutex);
+
+    if (wlen != (ssize_t) offset)
+        warn("Cannot append to log");
+ for (unsigned i = 0 ; i < hookup_count ; i++) + { + tslog_hookup_entry_t * const entry = &hookup_entries[i]; + + entry->handler( + entry->priv, + info, + buf + hdr_len, // skip the prefix + offset - hdr_len + ); + } + + // Restore errno so that the caller does not see a change + errno = serrno; +} + + +static const struct { + const char *n; + int v; +} nv[] = { + { "DIAG", TSDIAG }, + { "DEBUG", TSDEBUG }, + { "INFO", TSINFO }, + { "WARN", TSWARN }, + { "ERROR", TSERROR }, + { "FATAL", TSFATAL }, + { "COMPAT", TSCOMPAT } +}; + +const char * +tsloggetlevel(int level) +{ + size_t i; + for (i = 0; i < __arraycount(nv); i++) + if (nv[i].v == level) + return nv[i].n; + return nv[i - 1].n; +} + +int +tslogsetlevel(const char *l, int *level) +{ + for (size_t i = 0; i < __arraycount(nv); i++) { + if (strcasecmp(l, nv[i].n) == 0) { + *level = nv[i].v; + return 0; + } + } + TSLOGX(TSERROR, "Unknown level %s", l); + return -1; +} + +int +tslogsetabort(const char *l) +{ + for (size_t i = 0; i < __arraycount(nv); i++) + if (strcasecmp(l, nv[i].n) == 0) { + tsabort = nv[i].v; + return 0; + } + TSLOGX(TSERROR, "Unknown level %s", l); + return -1; +} + +int tslog_mklogdir(const char *program, const char *root_dir, + char *logdir, size_t logdir_size, + char *logfile, size_t logfile_size) +{ + const char *suffix; + if (root_dir != NULL) { + // No suffix needed + suffix = ""; + } + else if ((root_dir = getenv("HOME")) == NULL) { + warnx("No root directory specified " + "and unable to obtain the HOME environment " + "variable"); + return -1; + } + else { + // Path is $HOME/ts/log + suffix = "/ts/log"; + } + + char timestamp[TIMESTAMP_SIZE]; + size_t timestamp_result = tslog_strftime(timestamp, sizeof(timestamp), + "%Y%m%d-%H%M%S", true); + + if (timestamp_result == 0) { + warnx("Unable to construct the timestamp portion " + "of the log directory"); + return -1; + } + program = basename(program); + int result = snprintf(logdir, logdir_size, "%s%s/%s-%s.log", root_dir, + suffix, 
program, timestamp); + // + // Check that sprintf worked and then check to make sure that the + // directory doesn't already exist + // + if (result < 0) { + warn("Unable to snprintf the log directory"); + return -1; + } + else if (result >= (int)logdir_size) { + warnx("Needed %d bytes for the log directory name but only " + "%zu were provided", result, logdir_size); + return -1; + } + else if (tsdir_exists(logdir)) { + warnx("Directory '%s' already exists", logdir); + return -1; + } + + // + // Try to make the directory + // + if (tsdir_mkdir(logdir, 0775) != 0) { + warnx("Unable to make directory '%s'", logdir); + return -1; + } + + // + // Create the file name + // + result = snprintf(logfile, logfile_size, "%s/messages", logdir); + if (result < 0) { + warn("Unable to snprintf the log file"); + return -1; + } + else if (result >= (int)logfile_size) { + warnx("Needed %d bytes for the log directory name but only " + "%zu were provided", result, logfile_size); + } + return tslogopen(logfile, 0); +} + + +int +tslog_set_hookup( + tslog_hookup_t handler, + void * priv +) +{ + tslog_hookup_entry_t * const new_entries = realloc( + hookup_entries, + (hookup_count + 1) * sizeof(*new_entries) + ); + if (!new_entries) + return -1; + + hookup_entries = new_entries; + + hookup_entries[hookup_count++] = (tslog_hookup_entry_t) { + .handler = handler, + .priv = priv, + }; + + return 0; +} + + +int +tslog_unset_hookup( + tslog_hookup_t handler, + void * priv +) +{ + size_t i; + for (i = 0; i < hookup_count; i++) { + tslog_hookup_entry_t * entry = &hookup_entries[i]; + if (entry->handler == handler && entry->priv == priv) + break; + } + + if (i == hookup_count) + return -1; + + for (; i < hookup_count - 1; i++) + hookup_entries[i] = hookup_entries[i + 1]; + hookup_count--; + + return 0; +} + + +/** Deprecated functions for libraries that have not updated to the new + * version of base. + */ +void +tslog( + const char * pref, + const char * fmt, + ... 
+) __attribute__((__format__(__printf__, 2, 3),__deprecated__)); + + +void +tslogl( + int level, + const char * pref, + const char * fmt, + ... +) __attribute__((__format__(__printf__, 3, 4),__deprecated__)); + + +void +tslogx( + const char * pref, + const char * fmt, + ... +) __attribute__((__format__(__printf__, 2, 3),__deprecated__)); + + +void +tsloglx( + int level, + const char * pref, + const char * fmt, + ... +) __attribute__((__format__(__printf__, 3, 4),__deprecated__)); + + +void +tslog( + const char * pref, + const char * fmt, + ... +) +{ + tslog_info_t info = { + .file = pref, + .func = __func__, + .line = 0, + .level = TSCOMPAT, + .do_perror = 1, + }; + + va_list ap; + va_start(ap, fmt); + tslog_info_vargs(&info, fmt, ap); + va_end(ap); +} + + +void +tslogl( + int level, + const char * pref, + const char * fmt, + ... +) +{ + tslog_info_t info = { + .file = pref, + .func = __func__, + .line = 0, + .level = level, + .do_perror = 1, + }; + + va_list ap; + va_start(ap, fmt); + tslog_info_vargs(&info, fmt, ap); + va_end(ap); +} + + +void +tslogx( + const char * pref, + const char * fmt, + ... +) +{ + tslog_info_t info = { + .file = pref, + .func = __func__, + .line = 0, + .level = TSCOMPAT, + .do_perror = 0, + }; + + va_list ap; + va_start(ap, fmt); + tslog_info_vargs(&info, fmt, ap); + va_end(ap); +} + + +void +tsloglx( + int level, + const char * pref, + const char * fmt, + ... 
+) +{ + tslog_info_t info = { + .file = pref, + .func = __func__, + .line = 0, + .level = level, + .do_perror = 0, + }; + + va_list ap; + va_start(ap, fmt); + tslog_info_vargs(&info, fmt, ap); + va_end(ap); +} + + + +void +tshdump_info( + const tslog_info_t * info, + const void * const buf, + const size_t len +) +{ + const int serrno = errno; + + const unsigned bytes = _TSHDUMP_BYTES_PER_LINE; + char hex[bytes*3 + 1]; + char txt[bytes + 1]; + memset(hex, 0, sizeof(hex)); + memset(txt, 0, sizeof(txt)); + + const uint8_t * const bp = buf; + size_t offset = 0; + size_t i = 0; + + while (offset < len) + { + const uint8_t c = bp[offset++]; + snprintf(hex + i*3, 4, "%02x ", c); + snprintf(txt + i, 2, "%c", isprint(c) ? c : '.'); + + if ((++i % bytes) != 0 && offset != len) + continue; + + tslog_info(info, "%04zx: %-*s%s", + offset - i, + (int) sizeof(hex), + hex, + txt + ); + + memset(hex, 0, sizeof(hex)); + memset(txt, 0, sizeof(txt)); + i = 0; + } + + errno = serrno; +} diff --git a/src/tslog.h b/src/tslog.h new file mode 100644 index 0000000..d0a9444 --- /dev/null +++ b/src/tslog.h @@ -0,0 +1,287 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef TSLOG_H +#define TSLOG_H + +#include +#include +#include +#include +#include +#include + +__BEGIN_DECLS + +/* + * Describes the call site of one log message. tslog_info_vargs() treats + * line == 0 as the legacy mode in which file already holds the formatted + * prefix; a non-zero do_perror appends the strerror() text for errno. + */ +typedef struct { + const char * file; + int line; + const char * func; + int level; + int do_perror; +} tslog_info_t; + +/** + * Creates a log directory located below the specified root directory + * that has the pattern "program-timestamp.log" and creates a log file + * named "messages" in that directory which will be used for tslog + * messages. If the root_dir is NULL, then the HOME environment + * variable will be used and "/ts/log" will be appended to it. The + * timestamp portion of it will be in the format YYYYMMDD-HHMMSS. + * + * @param program the name of the program that is calling this + * function. This should be the value of argv[0] and cannot be NULL + * @param root_dir the directory under which the log directory should + * be created. If NULL, $HOME/ts/log will be used + * @param logdir the resulting log directory. + * @param logdir_size the length of the logdir argument + * @param logfile the resulting messages file. 
+ * @param logfile_size the length of the logfile argument + * @return the file descriptor of the created logfile, or -1 for + * failure + */ +int +tslog_mklogdir(const char *program, const char *root_dir, + char *logdir, size_t logdir_size, + char *logfile, size_t logfile_size); + + +typedef void (*tslog_hookup_t)( + void *cookie, + const tslog_info_t * info, + const char *message, + size_t message_len +); + +/** + * Set up a hook up function on log events + * @param hookup the hook up function + * @param cookie cookie passed to the function call + * @return 0 on success, -1 if the hook table cannot be grown + */ +int +tslog_set_hookup( + tslog_hookup_t hookup, + void * cookie +); + +/** + * Uninstalls a hook up function + * + * @return 0 on success, non-zero otherwise + */ +int +tslog_unset_hookup( + tslog_hookup_t handler, + void * priv +); + +const char *getlogstr(const char *, int, const char *, int); +const char *getnewlogstr(const char *, int, const char *, int); +int setlogstr(const char *); + +/** + * Logging to a file + * + * param #1: filename + * param #2: open flags + */ +int tslogopen(const char *, int); +int tslogclose(void); +const char *tsloggetlevel(int); +int tslogsetlevel(const char *, int *); +int tslogsetabort(const char *); +FILE *tslogfp(void); + +/** + * Logging to a file [the x version does not append (%s), strerror(errno)] + * + * param #1: prefix [NULL for none] + * param #2: format + * param #3: ... + */ +void +tslog_info( + const tslog_info_t *, + const char *fmt, + ... +) __attribute__((__format__(__printf__, 2, 3))); + +void +tslog_info_vargs( + const tslog_info_t *, + const char *fmt, + va_list args +); + + +/** + * Hexdump to the logfile. 
+ */ +void +tshdump_info( + const tslog_info_t *, + const void * buf, + size_t len +); + +extern int tsdebug; +extern int tsabort; +extern int tslevel; + +#define PREF getlogstr(__FILE__, __LINE__, __func__, TSCOMPAT) + +/* Log levels: a lower value is more severe. Negative values are parenthesised so they expand safely inside any expression. */ +#define TSDIAG 5 +#define TSDEBUG 4 +#define TSINFO 3 +#define TSWARN 2 +#define TSERROR 1 +#define TSFATAL 0 +#define TSCOMPAT (-1) + +#ifndef TSLOG_DIAG_ENABLED +# if defined(DIAGNOSTIC) +# define TSLOG_DIAG_ENABLED true +# else +# define TSLOG_DIAG_ENABLED false +# endif +#endif + +/* + * True when a message at level passes the tslevel threshold; TSDIAG + * additionally requires TSLOG_DIAG_ENABLED. The argument is parenthesised + * at every use so that an expression such as (a ? b : c) expands correctly. + */ +#define TS_IS_LOGGING(level) (((level) != TSDIAG || TSLOG_DIAG_ENABLED) && (level) <= tslevel) + +#define _TSLOG(level, do_perror, fmt, ...) \ + do { \ + if ((level) != TSDIAG || TSLOG_DIAG_ENABLED) { \ + if ((level) > tslevel && (level) != tsabort) \ + break; \ + tslog_info_t __tslog_info = { \ + __FILE__, \ + __LINE__, \ + __func__, /* NOLINT */ \ + (level), \ + (do_perror) \ + }; \ + tslog_info(&__tslog_info, fmt, ## __VA_ARGS__); /* NOLINT */ \ + if ((level) <= tsabort) \ + abort(); \ + } \ + } while (/*CONSTCOND*/false) + +#define TSLOG(level, fmt, ...) _TSLOG(level, 1, fmt, ## __VA_ARGS__) +#define TSLOGX(level, fmt, ...) _TSLOG(level, 0, fmt, ## __VA_ARGS__) + +/* Force an abort */ +#define TSABORT(fmt, ...) \ + do { \ + TSLOG(TSFATAL, fmt, ## __VA_ARGS__); \ + abort(); \ + } while (false) + +#define TSABORTX(fmt, ...) 
/* Counter limited log macros */
#define _DEFAULT_LOG_LIMIT 10000
/*
 * _TSLOGN(log_macro, limit, level, ...): rate-limited wrapper around
 * log_macro (TSLOG, TSLOGX or TSHDUMP).
 *
 * Each expansion owns two static counters, so the limit applies per call
 * site. The first (limit - 1) hits are logged normally. After that a hit is
 * logged only when the running count equals _threshold (initially `limit`);
 * that hit is followed by a "being suppressed" notice and _threshold is
 * doubled. `limit` initialises a static object, so it must be a constant
 * expression. The counters are plain statics updated without any
 * synchronisation.
 *
 * Duplicate the level checks because that potentially saves us from
 * loading the 2 static uint32's, which can result in a huge
 * performance penalty especially if cross-bus.
 */
#define _TSLOGN(log_macro, limit, level, ...) \
    do { \
        if ((level) != TSDIAG || TSLOG_DIAG_ENABLED) { \
            if ((level) > tslevel && (level) != tsabort) \
                break; \
            static uint32_t _count = 0; \
            static uint32_t _threshold = (limit); \
            if (++_count < (limit)) \
                log_macro((level), __VA_ARGS__); \
            else if (_count == _threshold) { \
                log_macro((level), __VA_ARGS__); \
                TSLOGX((level), "Previous message has been logged %" PRIu32 \
                    " times and is being suppressed", _count); \
                _threshold *= 2; \
            } \
        } \
    } while (/*CONSTCOND*/false)

/* Rate-limited TSLOG / TSLOGX with an explicit limit (N) or the default (L). */
#define TSLOGN(limit, ...) _TSLOGN(TSLOG, limit, __VA_ARGS__)
#define TSLOGXN(limit, ...) _TSLOGN(TSLOGX, limit, __VA_ARGS__)
#define TSLOGL(...) TSLOGN(_DEFAULT_LOG_LIMIT, __VA_ARGS__)
#define TSLOGXL(...) TSLOGXN(_DEFAULT_LOG_LIMIT, __VA_ARGS__)

#define _TSHDUMP_BYTES_PER_LINE 16
#define _TSHDUMP_LINE_LIMIT 100
#define _TSHDUMP_SIZE_LIMIT (_TSHDUMP_LINE_LIMIT*_TSHDUMP_BYTES_PER_LINE)
/*
 * Rate-limited hex dump. `size` is narrowed to uint32_t. Buffers larger than
 * _TSHDUMP_SIZE_LIMIT bytes are dumped only up to that size and followed by
 * a notice carrying the original size. The two branches are separate
 * _TSLOGN expansions and therefore keep separate counters.
 */
#define TSHDUMPN(limit, level, buffer, size) \
    do { \
        uint32_t _size = (size); \
        if (_size <= _TSHDUMP_SIZE_LIMIT) { \
            _TSLOGN(TSHDUMP, (limit), (level), (buffer), _size); \
        } else { \
            _TSLOGN(TSHDUMP, (limit), (level), (buffer), _TSHDUMP_SIZE_LIMIT); \
            TSLOGXN((limit), (level), "Previous hex dump had a size of %" PRIu32 \
                " bytes and is being truncated", _size); \
        } \
    } while (/*CONSTCOND*/false)
/* Default limit for dumps is the default log limit divided by the line limit. */
#define TSHDUMPL(...) TSHDUMPN(_DEFAULT_LOG_LIMIT/_TSHDUMP_LINE_LIMIT, __VA_ARGS__)
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "twosigma.h" + +#include +#include +#include +#include +#include +#include +#include "tslog.h" +#include "tstl.h" + +void * +_tstl_get(tstl_t *tstl) +{ +#ifdef _REENTRANT + int rc; + /* Multi-threaded version */ + if (__predict_false(!tstl->initialized)) { + if ((rc = pthread_mutex_lock(&tstl->mutex))) { + errno = rc; + err(1, "pthread_mutex_lock failed"); + } + + if (!tstl->initialized) { + rc = pthread_key_create( + __UNVOLATILE_T(pthread_key_t *, &tstl->key), + tstl->destroy_function); + + if (rc) { + errno = rc; + err(1, "pthread_key_create failed"); + } + + _mm_mfence(); + + tstl->initialized = 1; + } + + _mm_mfence(); + + if ((rc = pthread_mutex_unlock(&tstl->mutex))) { + errno = rc; + err(1, "pthread_mutex_unlock failed"); + } + } + + void *p = pthread_getspecific(tstl->key); + if (p) + return p; + + p = (*tstl->create_function)(tstl->create_arg); + assert(p != NULL); + if ((rc = pthread_setspecific(tstl->key, p))) { + errno = rc; + err(1, "pthread_setspecific failed"); + } + + return p; +#else + /* Single threaded version */ + if (tstl->p == NULL) { + void *p = (*tstl->create_function)(tstl->create_arg); + assert(p != NULL); + tstl->p = p; + } + tstl->initialized = 1; + return tstl->p; +#endif +} + + +void * +_tstl_buf_create(void *arg) +{ + size_t size; + void *p; + + size = (size_t) arg; + p = calloc(1, size); + if (! 
p) { + err(1, "calloc(1, %zu) failed", size); + abort(); + } + return p; +} + +typedef struct { + uint32_t count, next; + void (*destroy_function)(void *); + void *pointers[0]; +} _tstl_rr_inner_t; + +void * +_tstl_rr_inner_create(void *create_arg) +{ + size_t count = (size_t) create_arg; + _tstl_rr_inner_t *inner = calloc(1, sizeof(*inner) + (count * sizeof(void *))); + assert(inner != NULL); + inner->count = count; + return inner; +} + +void +_tstl_rr_inner_free(void *arg) +{ + _tstl_rr_inner_t *inner = arg; + uint32_t count = inner->count; + void **pointers = inner->pointers; + void (*destroy_function)(void *) = inner->destroy_function; + if (destroy_function != NULL) { + for (uint32_t i = 0; i < count; i++) { + void *pointer = pointers[i]; + if (pointer != NULL) { + destroy_function(pointer); + } + } + } + free(inner); +} + +void * +tstl_rr_get(tstl_rr_t *rr) +{ + _tstl_rr_inner_t *inner = tstl_get(&rr->tstl); + uint32_t next = inner->next; + void *ret = inner->pointers[next]; + if (__predict_false(ret == NULL)) { + inner->pointers[next] = ret = rr->create_function(rr->create_arg); + assert(ret != NULL); + // hack alert: set inner->destroy_function here since we don't have a pointer to it in _tstl_rr_inner_create() + inner->destroy_function = rr->destroy_function; + } + inner->next = (next + 1) % inner->count; + return ret; +} + +#define TSTL_STRERROR_BUF_SIZE 256 +static tstl_t tstl_strerror_buf = TSTL_BUF_INITIALIZER(TSTL_STRERROR_BUF_SIZE); + +char * +tstl_strerror(int errnum) +{ + char *buf = tstl_get(&tstl_strerror_buf); + assert(buf != NULL); + if (strerror_r(errnum, buf, TSTL_STRERROR_BUF_SIZE) == -1) { + TSLOG(TSWARN, "strerror_r(%d, %p, %d) failed", errnum, buf, TSTL_STRERROR_BUF_SIZE); + snprintf(buf, TSTL_STRERROR_BUF_SIZE, "error #%d", errnum); + } + return buf; +} diff --git a/src/tstl.h b/src/tstl.h new file mode 100644 index 0000000..67b97ee --- /dev/null +++ b/src/tstl.h @@ -0,0 +1,144 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * 
/**
 * Use this macro to init a threadlocal of any type (struct, int, string, whatever). Just supply
 * a create_function and an optional argument, and optionally provide a destroy_function that will
 * be called when the thread dies.
 *
 * TSTL_MUTEX_INITIALIZER supplies the trailing member initialiser that
 * differs between the threaded (_REENTRANT) and single-threaded layouts.
 */
#ifdef _REENTRANT
#define TSTL_MUTEX_INITIALIZER .mutex = PTHREAD_MUTEX_INITIALIZER,
#else
#define TSTL_MUTEX_INITIALIZER .p = NULL,
#endif
#define TSTL_INITIALIZER(cf, df, ca) \
    { \
        .create_function = (cf), \
        .destroy_function = (df), \
        .create_arg = (ca), \
        .initialized = 0, \
        TSTL_MUTEX_INITIALIZER \
    }

/**
 * Use this simpler macro to init a threadlocal for a simple buffer (e.g. char[]). Just supply the
 * size, in bytes.
 *
 * The size is parenthesised before the cast so that expressions such as
 * `n * 2` convert as a whole instead of casting only their first operand.
 */
#define TSTL_BUF_INITIALIZER(size) \
    TSTL_INITIALIZER(_tstl_buf_create, free, (void *)(size))
This will cause the create_function to be invoked + * if it is the first time you're getting the value on the current thread. + */ +static inline __attribute__((__always_inline__)) void *tstl_get(tstl_t *); + +/** + * A "round robin" thread-local type. This is like a tstl_t that keeps multiple values for each thread and returns + * them in round-robin order. This is useful for char buffers or other objects where you might want to use several + * instances simulatenously on a given thread. + */ +typedef struct _tstl_rr tstl_rr_t; + +void * tstl_rr_get(tstl_rr_t *rr); + +void *_tstl_rr_inner_create(void *); +void _tstl_rr_inner_free(void *); + +#define TSTL_RR_INITIALIZER(cf, df, ca, count) \ + { \ + .create_function = cf, \ + .destroy_function = df, \ + .create_arg = ca, \ + .tstl = TSTL_INITIALIZER(_tstl_rr_inner_create, _tstl_rr_inner_free, (void *) count) \ + } + +#define TSTL_RR_BUF_INITIALIZER(size, count) TSTL_RR_INITIALIZER(_tstl_buf_create, free, (void *) size, count) + +// no user-servicable parts below this line + +struct _tstl { + void *(*create_function)(void *); + void (*destroy_function)(void *); + void *create_arg; + volatile int initialized; +#ifdef _REENTRANT + volatile pthread_key_t key; + pthread_mutex_t mutex; +#else + void *p; +#endif +}; + +struct _tstl_rr { + void *(*create_function)(void *); + void (*destroy_function)(void *); + void *create_arg; + tstl_t tstl; +}; + +void *_tstl_get(tstl_t *tstl); + +static inline __attribute__((__always_inline__)) void * +tstl_get(tstl_t *tstl) +{ +#if TSTL_INLINE_GET + if (__predict_true(tstl->initialized)) { + compiler_fence(); +#ifdef _REENTRANT + void *p = pthread_getspecific(tstl->key); + if (__predict_true(p != NULL)) { + return p; + } +#else + return tstl->p; +#endif + } +#endif + return _tstl_get(tstl); +} + +void *_tstl_buf_create(void *); + +char *tstl_strerror(int); + +__END_DECLS + +#endif diff --git a/src/twosigma.h b/src/twosigma.h new file mode 100644 index 0000000..b56b251 --- /dev/null +++ 
b/src/twosigma.h @@ -0,0 +1,185 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef _TWOSIGMA_H_ +#define _TWOSIGMA_H_ + +/* We want everything */ +#if defined(__linux__) +# ifndef _LARGEFILE64_SOURCE +# define _LARGEFILE64_SOURCE +# endif +# ifndef _FILE_OFFSET_BITS +# define _FILE_OFFSET_BITS 64 +# endif +# ifndef _GNU_SOURCE +# define _GNU_SOURCE +# include +# endif /* _GNU_SOURCE */ + +#define TSEXPORT __attribute__((visibility ("default"))) + +/* + * Fixup string.h: We want _GNU_SOURCE for the extensions, but that exposes + * the bad glibc strerror_r() which is not standards compliant + */ +/* But not the bad strerror! 
*/ +#define strerror_r __gnu_strerror_r +#include +#undef strerror_r +/* We don't do the REDIRECT_NTH here because we might not be included first */ +# ifdef __cplusplus +extern "C" +# else +extern +# endif +int __xpg_strerror_r (int __errnum, char *__buf, size_t __buflen) + __THROW __nonnull ((2)); +#define strerror_r __xpg_strerror_r + +# ifdef __INTEL_COMPILER +# include "intel_compiler.h" +# endif +#endif /* __linux__ */ + +#ifndef __GNUC_PREREQ +#define __GNUC_PREREQ(maj,min) 0 +#endif + +#include + +#ifndef _C_LABEL +#define _C_LABEL(x) x +#define _C_LABEL_STRING(x) #x + +#define __strong_alias(alias,sym) \ + __asm(".global " _C_LABEL_STRING(alias) "\n" \ + _C_LABEL_STRING(alias) " = " _C_LABEL_STRING(sym)); + +#define __weak_alias(alias,sym) \ + __asm(".weak " _C_LABEL_STRING(alias) "\n" \ + _C_LABEL_STRING(alias) " = " _C_LABEL_STRING(sym)); + +#define __weak_reference(sym) __attribute__((__weak__)) + +#define __warn_references(sym,msg) \ + __asm(".section .gnu.warning." _C_LABEL_STRING(sym) "\n\t.ascii \"" msg "\"\n\t.text"); +#endif + +#ifndef __predict_true +/* + * GNU C version 2.96 adds explicit branch prediction so that + * the CPU back-end can hint the processor and also so that + * code blocks can be reordered such that the predicted path + * sees a more linear flow, thus improving cache behavior, etc. + * + * The following two macros provide us with a way to use this + * compiler feature. Use __predict_true() if you expect the expression + * to evaluate to true, and __predict_false() if you expect the + * expression to evaluate to false. + * + * A few notes about usage: + * + * * Generally, __predict_false() error condition checks (unless + * you have some _strong_ reason to do otherwise, in which case + * document it), and/or __predict_true() `no-error' condition + * checks, assuming you want to optimize for the no-error case. 
/*
 * Explicitly discard a value: the argument is cast to unsigned long and
 * passed to __ignore(), which does nothing but reference it through __USE.
 * NOTE(review): presumably intended for results the compiler insists must be
 * used — confirm against callers.
 */
#define __IGNORE(result) \
    __ignore(__STATIC_CAST(unsigned long, result))

/* Sink for __IGNORE; only touches its parameter via __USE. */
static __inline void
__ignore(unsigned long result) {
    __USE(result);
}
this statement. + * + * http://software.intel.com/en-us/forums/threading-on-intel-parallel-architectures/topic/65071/ + */ +#define compiler_fence() __asm__ __volatile__ ("" : : : "memory") + +/** + * Force the processor to flush pending writes, and not to reorder instructions + * around this statement. implies compiler_fence() + */ +/* #define processor_fence() _mm_mfence() */ +#define processor_fence() __asm__ __volatile__ ("lock; addq $0,0(%%rsp)" \ + : : : "memory") + +/** + * Support the restrict type qualifier in C++. + * C++ does not include the C99's "restrict" type qualifier, but gcc + * and clang support an equivalent "__restrict__" type qualifier in C++. + */ +#ifdef __cplusplus +#define restrict __restrict__ +#endif + +#endif /* _TWOSIGMA_H_ */ diff --git a/src/wait_for_heartbeat.cc b/src/wait_for_heartbeat.cc new file mode 100644 index 0000000..58785e7 --- /dev/null +++ b/src/wait_for_heartbeat.cc @@ -0,0 +1,86 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include "iqueue.hh" +#include +#include +#include +#include +#include + +namespace iqueue = ts::mmia::cpputils; +struct options { + char *iqueue_path; + uint64_t shash_key; + std::experimental::optional latency_bound_ns; +}; + +struct options get_options(int argc, char **argv) +try { + if (argc == 3) { + struct options opt = { + .iqueue_path = argv[1], + .shash_key = std::stoul(argv[2], nullptr, 0), + .latency_bound_ns = std::experimental::nullopt, + }; + return opt; + } else if (argc == 4) { + struct options opt = { + .iqueue_path = argv[1], + .shash_key = std::stoul(argv[2], nullptr, 0), + .latency_bound_ns = std::stoul(argv[3], nullptr, 0), + }; + return opt; + } else { + throw std::runtime_error("Incorrect number of arguments"); + } +} catch (std::exception const& e) { + warnx("Usage: %s [latency_tolerance]", + (argc > 0 ? argv[0] : "wait_for_heartbeat")); + warnx("%s", e.what()); + exit(1); +} + +uint64_t nanos() { + using namespace std::chrono; + return duration_cast(high_resolution_clock::now().time_since_epoch()).count(); +} + +int main(int argc, char **argv) { + struct options opt = get_options(argc, argv); + iqueue::iqueue iq(opt.iqueue_path, iqueue::access_mode::read_only); + // spin until we can get table 0 + shash_t* table = nullptr; + while (!table) { + table = iqueue_writer_table(static_cast(iq), 0, false); + std::this_thread::sleep_for(std::chrono::microseconds(100)); + } + // spin until we can get the entry for this shash_key + shash_entry_t* entry = nullptr; + while (!entry) { + entry = shash_get(table, opt.shash_key); + std::this_thread::sleep_for(std::chrono::microseconds(100)); + } + if (opt.latency_bound_ns) { + uint64_t latency_bound_ns = *opt.latency_bound_ns; + // spin until the shash key value meets latency bounds + while (entry->value < (nanos() - latency_bound_ns)) { + std::this_thread::sleep_for(std::chrono::microseconds(100)); + } + } + return 0; +} diff --git a/test/copyout.test b/test/copyout.test new file 
mode 100755 index 0000000..7ab2c0b --- /dev/null +++ b/test/copyout.test @@ -0,0 +1,143 @@ +#! /bin/bash +# Copyright 2021 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Global settings +DIR=$(dirname $0) +PROG=$(basename $0) +PKGDIR=$(cd $(dirname ${0})/../..; pwd -L) + +# Populate the TS_*_HOME variables +eval $($(${PKGDIR}/bin/gettools)/bin/makepath -s -r ${PKGDIR}) + +WORKDIR="$(mktemp -d -t $PROG.XXXXXXXXXX)" +DCATSIMPLE=${TS_DCAT_NATIVE_HOME}/bin/dcatsimple +trap "rm -fr $WORKDIR" 0 1 2 3 15 + + +echo Using $WORKDIR +IQ1=$WORKDIR/iq1.iqx +IQ2=$WORKDIR/iq2.iqx +OUT=$WORKDIR/test.out + +die() { + echo >&2 "$@" + exit 1 +} + +warn() { + echo >&2 "$@" +} + +echo USER HEADER | ${PKGDIR}/bin/iqueue \ + --create \ + --header \ + -f $IQ1 \ +|| die "$IQ1: Unable to create" + + +# +# Test the empty case, with a user header +# we cut out the offset column because it can vary in size +# +${PKGDIR}/bin/iqueue -f $IQ1 --copyout | od -x | cut -d' ' -f2- > $OUT +cat < $OUT +cat < $OUT + +cat < $OUT + +cat < $OUT +cat <&2 "$0: Test passed" +exit 0 diff --git a/test/ctest.h b/test/ctest.h new file mode 100644 index 0000000..35d43d8 --- /dev/null +++ b/test/ctest.h @@ -0,0 +1,156 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef _CTEST_H +#define _CTEST_H + +#ifdef __cplusplus +extern "C++" { +# include +# include +# include +} +#else +# include +# include +# include +# include +#endif + +__BEGIN_DECLS + +typedef bool (*ts_run_test_func_t)(void); + +typedef struct ts_test_case { + const char *name; + ts_run_test_func_t run; + bool enabled; + struct ts_test_case *next; +} ts_test_case_t; + +// we only define this global pointer in a single translation unit, all others +// pull it in with an extern declaration. +#ifndef CTEST_DEFINE_MAIN +extern +#endif +ts_test_case_t *ts_test_global_test_suite; + +static inline void +ts_add_test_case( + const char *test_name, + ts_run_test_func_t run_test_func, + bool is_enabled) +{ +#ifdef __cplusplus + ts_test_case_t *test_case = reinterpret_cast(malloc(sizeof(*test_case))); +#else + ts_test_case_t *test_case = malloc(sizeof(*test_case)); +#endif + *test_case = (ts_test_case_t) { + test_name, + run_test_func, + is_enabled, + ts_test_global_test_suite + }; + + ts_test_global_test_suite = test_case; +} + +#define TS_TEST_ASSERT_FORMAT(expr, format, ...) 
\ + do { \ + if (expr) {} else { \ + fprintf(global_std_out_fd, "\tassert(\"%s\") from %s:%d\n", #expr, __FILE__, __LINE__); /* NOLINT */ \ + fprintf(global_std_out_fd, format, ##__VA_ARGS__); /* NOLINT */ \ + fprintf(global_std_out_fd, "\n"); /* NOLINT */ \ + \ + fprintf(global_test_output_fd, "\t\t\"%s\": \n\t\t", __FILE__, __LINE__, #expr); /* NOLINT */ \ + fprintf(global_test_output_fd, format, ##__VA_ARGS__); /* NOLINT */ \ + fprintf(global_test_output_fd, "\n\t\t\n"); /* NOLINT */ \ + \ + return false; \ + } \ + } while (false) + +#define TS_TEST_EQUALS_FORMAT(a, b, format, ...) \ + do { \ + bool _result = ((a) == (b)); \ + if (!_result) { \ + fprintf(global_std_out_fd, "\tequals(\"%s\", \"%s\") from %s:%d\n\t", #a, #b, __FILE__, __LINE__); /* NOLINT */ \ + fprintf(global_std_out_fd, format, ##__VA_ARGS__); /* NOLINT */ \ + fprintf(global_std_out_fd, "\n"); /* NOLINT */ \ + \ + fprintf(global_test_output_fd, "\t\t\"%s\"!=\"%s\"\n\t\t", __FILE__, __LINE__, #a, #b); /* NOLINT */ \ + fprintf(global_test_output_fd, format, ##__VA_ARGS__); /* NOLINT */ \ + fprintf(global_test_output_fd, "\n\t\t\n"); /* NOLINT */ \ + \ + return false; \ + } \ + } while (false) + +// using 'expr' without protective parentheses in the condition here is +// deliberate. 
This is to enable the compiler to warn about accidental +// assignments instead of comparisons in the test +#define TS_TEST_ASSERT(expr) \ + do { \ + if (expr) {} else { \ + fprintf(global_std_out_fd, "\tassert(\"%s\") from %s:%d\n", #expr, __FILE__, __LINE__); /* NOLINT */ \ + \ + fprintf(global_test_output_fd, "\t\t\"%s\"\n", __FILE__, __LINE__, #expr); /* NOLINT */ \ + return false; \ + } \ + } while (false) + +#define TS_TEST_EQUALS(a, b) \ + do { \ + bool _result = ((a) == (b)); \ + if (!_result) { \ + fprintf(global_std_out_fd, "\tequals(\"%s\", \"%s\") from %s:%d\n\t", #a, #b, __FILE__, __LINE__); /* NOLINT */ \ + \ + fprintf(global_test_output_fd, "\t\t\"%s\"!=\"%s\"\n", __FILE__, __LINE__, #a, #b); /* NOLINT */ \ + return false; \ + } \ + } while (false) + +#define TS_ADD_TEST(_test_name) \ + static inline bool run_##_test_name(void); \ + __attribute__((constructor)) \ + static void add_test_##_test_name(void) { \ + ts_add_test_case(#_test_name, run_##_test_name, true); \ + } \ + static inline bool run_##_test_name() + +#define TS_ADD_DISABLED_TEST(_test_name) \ + static inline bool run_##_test_name(void); \ + __attribute__((constructor)) \ + static inline void disable_test_##_test_name(void) { \ + ts_add_test_case(#_test_name, run_##_test_name, false); \ + } \ + static inline bool run_##_test_name() + +#ifdef CTEST_DEFINE_MAIN + +#include "ctest_main.h" + +#else + +extern FILE *global_test_output_fd; +extern FILE *global_std_out_fd; +extern bool global_ts_test_cleanup_on_exit; + +#endif + +__END_DECLS + +#endif diff --git a/test/ctest_main.h b/test/ctest_main.h new file mode 100644 index 0000000..64cd0d6 --- /dev/null +++ b/test/ctest_main.h @@ -0,0 +1,305 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef _CTEST_MAIN_H +#define _CTEST_MAIN_H + +#ifdef __cplusplus +extern "C++" { +# include +# include +} +#else +# include +# include +#endif + +#include + +/************************************************************ + * global + ************************************************************/ +FILE *global_test_output_fd = NULL; +FILE *global_std_out_fd = NULL; +bool global_ts_test_cleanup_on_exit = true; + +/************************************************************ + * options + ************************************************************/ +const char *option_string = "r:o:lqkd:h"; + +#define DEFAULT_USAGE "DEFAULT_USAGE" // icc complains if we use simply usage(NULL) + +static struct option long_options[] = { + { "run", required_argument, NULL, 'r' }, + { "output", required_argument, NULL, 'o' }, + { "list", no_argument, NULL, 'l' }, + { "quiet", no_argument, NULL, 'q' }, + { "keep-log", no_argument, NULL, 'k' }, + { "help", no_argument, NULL, 'h' }, + { 0, 0, 0, 0}, +}; + +static void +__attribute__((noreturn)) +__attribute__((__format__(__printf__, 1, 2))) +usage(const char * error_fmt, ...) 
+{ + static const char usage_str[] = + "usage: []\n" + "\n" + " --run | -r Run test\n" + " --output | -o Output xml file name\n" + " --list | -l List all tests\n" + " --quiet | -q Disable print\n" + " --keep-log | -k Keep log when test finishes\n" + " --help | -h help\n"; + if (strcmp(error_fmt, DEFAULT_USAGE) != 0) { + va_list ap; + va_start(ap, error_fmt); + vfprintf(stderr, error_fmt, ap); + va_end(ap); + fprintf(stderr, "\n%s", usage_str); + + exit(EXIT_FAILURE); + } + else { + fprintf(stdout, "%s", usage_str); + exit(EXIT_SUCCESS); + } +} + +/************************************************************ + * sub functions + ************************************************************/ + +static bool ts_test_harness_run(const char *name) +{ + if (ts_test_global_test_suite == NULL) { + fprintf(global_std_out_fd, "no test case to run\n"); + return 0; + } + + bool suite_succeeded = true; + int fail_count = 0; + int skip_count = 0; + int success_count = 0; + + fprintf(global_test_output_fd, "\n"); + + for (ts_test_case_t *test_case = ts_test_global_test_suite; + test_case != NULL; test_case = test_case->next) + { + if (name && strcmp(test_case->name, name) != 0) { + continue; + } + + // Only skip the test if it is not asked for specificaly + if (!name && !test_case->enabled) { + fprintf(global_test_output_fd, + "\t\n", + test_case->name); + + fprintf(global_std_out_fd, "SKIPPED Test :%s\n", test_case->name); + + skip_count++; + continue; + } + + fprintf(global_test_output_fd, + "\t\n", + test_case->name); + fprintf(global_std_out_fd, "Starting test: '%s'\n", test_case->name); + + bool ret = test_case->run(); + + fprintf(global_std_out_fd, + "Finished test: '%s', Result: %s\n", + test_case->name, ret ? 
"SUCCEED":"FAILED"); + + fprintf(global_test_output_fd, "\t\n"); + + if (!ret) { + suite_succeeded = false; + fail_count++; + } else { + success_count++; + } + + if (name) { + break; + } + } + + fprintf(global_std_out_fd, + "Total %d tests: %d succeed, %d failed, %d skipped.\n", + success_count + skip_count + fail_count, + success_count, + fail_count, + skip_count); + + fprintf(global_test_output_fd, "\n"); + + if (name && success_count == 0 && fail_count == 0) { + fprintf(global_std_out_fd, "The test case %s was not found\n", name); + } + + return suite_succeeded; +} + +static void ts_test_harness_list_all_xml(void) +{ + for (ts_test_case_t *test_case = ts_test_global_test_suite; + test_case != NULL; test_case = test_case->next) + { + fprintf(global_test_output_fd, + "\t\n", + test_case->name, test_case->enabled ? "true":"false"); + } +} + +static void ts_test_harness_list_all(void) +{ + for (ts_test_case_t *test_case = ts_test_global_test_suite; + test_case != NULL; test_case = test_case->next) + { + fprintf(global_std_out_fd, + "%s,%s\n", test_case->name, + test_case->enabled ? "enabled" : "disabled"); + } +} + +// Utility functions for a /dev/null FILE stream +static ssize_t dev_null_read(void *cookie, char *buf, size_t size) { + // It's an error to read from this stream + (void)cookie; + (void)buf; + (void)size; + return -1; +} + +static ssize_t dev_null_write(void *cookie, const char *buf, size_t size) { + // Write always succeeds! 
+ (void)cookie; + (void)buf; + return size; +} + +static int dev_null_seek(void *cookie, off64_t *offset, int whence) { + // Cannot seek + (void)cookie; + (void)offset; + (void)whence; + return -1; +} + +static int dev_null_close(void *cookie) { + // Close does nothing + (void)cookie; + return 0; +} + +static FILE *open_dev_null(void) { + return fopencookie(NULL, "w", + (cookie_io_functions_t) { + dev_null_read, + dev_null_write, + dev_null_seek, + dev_null_close + }); +} + +/************************************************************ + * main function + ************************************************************/ +int main(int argc, char **argv) +{ + int option_index = 0; + char *output_file_name = NULL; + bool is_quiet = false; + char *run_test_name = NULL; + bool list_only = false; + bool keep_log = false; + while (1) + { + int c = getopt_long( + argc, + argv, + option_string, + long_options, + &option_index + ); + if (c == -1) + break; + switch (c) { + case 'r': + run_test_name = optarg; + break; + case 'o': + output_file_name = optarg; + break; + case 'l': + list_only = true; + break; + case 'q': + is_quiet = true; + break; + case 'k': + keep_log = true; + break; + case 'h': + usage(DEFAULT_USAGE); + default: + usage("Invalid option '%c'", c); + } + } + if (output_file_name) { + global_test_output_fd = fopen(output_file_name, "w"); + if (!global_test_output_fd) { + fprintf(stderr, "Unable to open output file %s\n", output_file_name); + exit(EXIT_FAILURE); + } + } else { + global_test_output_fd = open_dev_null(); + } + + if (!is_quiet) { + global_std_out_fd = stdout; + } else { + global_std_out_fd = open_dev_null(); + } + + if (keep_log) { + global_ts_test_cleanup_on_exit = false; + fprintf(global_test_output_fd, "Keep test log for debugging!"); + } else { + global_ts_test_cleanup_on_exit = true; + fprintf(global_test_output_fd, "Will cleanup test data"); + } + + if (list_only) { + ts_test_harness_list_all_xml(); + ts_test_harness_list_all(); + return 0; + } 
+ + if (!ts_test_harness_run(run_test_name)) { + return -1; + } + + return 0; +} + +#endif // _CTEST_MAIN_H diff --git a/test/ctest_resource.c b/test/ctest_resource.c new file mode 100644 index 0000000..7363098 --- /dev/null +++ b/test/ctest_resource.c @@ -0,0 +1,161 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "twosigma.h" +#include "err.h" +#include "tslog.h" + +#define CTEST_DEFINE_MAIN + +#include "ctest.h" +#include "ctest_resource.h" +#include "tsclock.h" + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +/************************************************************ + * global + *************************************************************/ + +ts_test_resource_t *test_resources = NULL; +ts_test_resource_t *test_resource = NULL; + +static void ts_test_resource_dir_cleanup(void *data) +{ + if (!data) + return; + TSLOGX(TSDEBUG, "clean up dir %s", (char *)data); + char *buf; + if (asprintf(&buf, "rm -rf %s", (char *)data) <= 1) { + err(1, NULL); + }; + int ret = system(buf); + __USE(ret); + free(data); +} +char * ts_test_resource_get_dir(const char *testcase_name) +{ + char *temp_dir = getenv("TMPDIR"); + + // Make a template for unique directory name + char *template; + if (asprintf(&template, "%s/%s-%d-XXXXXX", (temp_dir == NULL) ? 
"/tmp" : temp_dir, + testcase_name, (int)getpid()) <= 0) + return NULL; + + // create a unique temp directory using the template + if (mkdtemp(template) != NULL) { + // set the temp directory permission to 755 + chmod(template, S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH); + // clean up temp directory when exit + TS_ON_DESTROY(template, ts_test_resource_dir_cleanup); + return template; + } + else + return NULL; +} + +static void ts_test_resource_clean_up(void) +{ + if (!global_ts_test_cleanup_on_exit) { + return; + } + + test_resource = test_resources; + while (test_resource != NULL) { + test_resource->cleanup(test_resource->data); + ts_test_resource_t *last = test_resource; + test_resource = test_resource->next; + free(last); + } + if (global_test_output_fd) { + fclose(global_test_output_fd); + global_test_output_fd = NULL; + } + test_resources = NULL; +} + +void ts_test_local_resource_clean_up(void) +{ + test_resource = test_resources; + while (test_resource != NULL) { + test_resource->cleanup(test_resource->data); + ts_test_resource_t *last = test_resource; + test_resource = test_resource->next; + free(last); + } + test_resources = NULL; +} + +static void +signal_handler( + int signum, + siginfo_t * const siginfo, + void * const context +) +{ + __USE(signum); + __USE(siginfo); + __USE(context); + + ts_test_resource_clean_up(); + + abort(); +} + +void signal_handler_install(void) { + static int installed; + if(installed) + return; + + static const struct sigaction sigact = { + .sa_sigaction = signal_handler, + .sa_flags = SA_SIGINFO, + }; + + /* + * JVM uses signal for normal process and the old way (always install the handler + * is incompatible with it. 
So we only install the handler when there is no previous + * handler installed + */ + + int signals[] = { SIGSEGV, SIGBUS, SIGILL, SIGPIPE, SIGABRT, SIGTERM }; + struct sigaction prev; + + for (size_t i = 0; i < sizeof(signals)/sizeof(signals[0]); ++i) { + memset(&prev, 0, sizeof(prev)); + + int signo = signals[i]; + + sigaction(signo, &sigact, &prev); + + if (prev.sa_handler != SIG_DFL && prev.sa_handler != SIG_IGN) { + sigaction(signo, &prev, NULL); + } + } + atexit(ts_test_resource_clean_up); + installed = 1; +} diff --git a/test/ctest_resource.h b/test/ctest_resource.h new file mode 100644 index 0000000..86a9171 --- /dev/null +++ b/test/ctest_resource.h @@ -0,0 +1,43 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef _TEST_RESOURCE_H +#define _TEST_RESOURCE_H + +#include + +typedef struct ts_test_resource { + void *data; + void (*cleanup)(void *); + struct ts_test_resource *next; +} ts_test_resource_t; + +__BEGIN_DECLS +extern ts_test_resource_t *test_resources; +extern ts_test_resource_t *test_resource; + +#define TS_ON_DESTROY(obj, func) \ + signal_handler_install(); \ + ts_test_resource_t *new_resource = calloc(1, sizeof(ts_test_resource_t)); \ + new_resource->data = obj; \ + new_resource->cleanup = func;\ + new_resource->next = test_resources; \ + test_resources = new_resource; \ + +char * ts_test_resource_get_dir(const char *); +void ts_test_local_resource_clean_up(void); +void signal_handler_install(void); +__END_DECLS +#endif diff --git a/test/grow_ctest.c b/test/grow_ctest.c new file mode 100644 index 0000000..c250b7b --- /dev/null +++ b/test/grow_ctest.c @@ -0,0 +1,93 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** \file + * Create multiple threads all allocating new spcae in the iqueue + * to stress the sparse growing routines. 
+ */ +#include "twosigma.h" + +#include "ctest.h" +#include "ctest_resource.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "tslog.h" +#include "tsclock.h" +#include "iqueue.h" + + + +static void * +allocate_thread( + void * const iq_ptr +) +{ + iqueue_t * const iq = iq_ptr; + uint64_t count = 0; + + while (1) + { + iqueue_msg_t msg; + const void * buf = iqueue_allocate_raw(iq, 1 << 16, &msg); + if (!buf) + break; + count++; + } + + TSLOGXL(TSINFO, "Created %"PRIu64" sections", count); + return NULL; +} + + + +TS_ADD_TEST(test) +{ + tslevel = TSDEBUG; + + char *dir = ts_test_resource_get_dir("iqueue_grow"); + char *iqx_file = NULL; + if (asprintf(&iqx_file, "%s/test.iqx", dir) == -1) + TSABORTX("failed to asprintf"); + + iqueue_t * const iq = iqueue_create(iqx_file, 0, NULL, 0); + TS_TEST_ASSERT(iq); + + TSLOGXL(TSINFO, "%s: Finished map", iqx_file); + + const int num_threads = 8; + pthread_t threads[num_threads]; + for (int i = 0 ; i < num_threads ; i++) + { + if (pthread_create(&threads[i], NULL, allocate_thread, iq) < 0) + TSABORT("thread create"); + } + + for (int i = 0 ; i < num_threads ; i++) + pthread_join(threads[i], NULL); + + unlink(iqx_file); + + return true; +} diff --git a/test/heartbeat-writeback-test.c b/test/heartbeat-writeback-test.c new file mode 100644 index 0000000..2321c14 --- /dev/null +++ b/test/heartbeat-writeback-test.c @@ -0,0 +1,78 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** \file + * Test interaction between the writeback thread and the iqueue heartbeats. + * Goal: determine if the writeback can cause gaps. + */ +#include "twosigma.h" +#include +#include +#include "tslog.h" +#include "tsclock.h" +#include "iqueue.h" +#include "ftrace.h" + +int main(int argc, char ** argv) +{ + ftrace_open(); + ftrace_start(); + + if (argc <= 1) + TSABORTX("usage: %s /path/to/test.iqx", argv[0]); + + const char * const filename = argv[1]; + iqueue_t * const iq = iqueue_create( + filename, + 0, + NULL, + 0 + ); + + if (!iq) + TSABORTX("%s: unable to create", filename); + + shash_t * const sh = iqueue_writer_table(iq, 0, 1); + if (!sh) + TSABORTX("%s: unable to create write table", filename); + + shash_entry_t * const entry = shash_insert_or_get(sh, 0xDECAFBAD, 0); + if (!entry) + TSABORTX("%s: unable to create writer", filename); + int check = 0; + + for (uint64_t i = 0 ; ; i++) + { + const uint64_t now = tsclock_getnanos(0); + if (check) + { + const uint64_t old = entry->value; + ftrace_mark("gap %"PRIu64" ns", now - old); + ftrace_stop(); + if (now - old > 100e3) + TSLOGXL(TSINFO, "%"PRIu64" gap", now - old); + check = 0; + } + + iqueue_writer_update(sh, entry, now); + check = 1; + + if ((i & (0xFFFFF)) == 0) + iqueue_append(iq, &now, sizeof(now)); + } + + //unlink(filename); + return 0; +} diff --git a/test/iqmod_ctest.c b/test/iqmod_ctest.c new file mode 100644 index 0000000..8557884 --- /dev/null +++ b/test/iqmod_ctest.c @@ -0,0 +1,87 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "twosigma.h" + +#include "ctest.h" +#include "ctest_resource.h" +#include +#include "sys/wait.h" + +static inline bool +exec(char *cmd, const char *expected_output) +{ + FILE *proc = popen(cmd, "re"); + TS_TEST_ASSERT(proc); + char* output = NULL; + size_t n = 0; + errno = 0; + TS_TEST_ASSERT(getdelim(&output, &n, 0, proc) < 0 || errno == 0); + pclose(proc); + + if (expected_output) { + TS_TEST_ASSERT(strstr(output, expected_output) != NULL); + } + free(output); + return true; +} + +TS_ADD_TEST(test) +{ + char *dir = ts_test_resource_get_dir("iqmod"); + + char *test_iqx = NULL; + TS_TEST_ASSERT(asprintf(&test_iqx, "%s/test.iqx", dir) != -1); + + char cmd[2000]; + + // create empty iqueue + TS_TEST_ASSERT(sprintf(cmd, "./iqueue -C -f %s", test_iqx) > 0); + TS_TEST_ASSERT(exec(cmd, NULL)); + + // mod creation_time + TS_TEST_ASSERT(sprintf(cmd, "./iqmod_inplace -c 12345 %s", test_iqx) > 0); + TS_TEST_ASSERT(exec(cmd, NULL)); + TS_TEST_ASSERT(sprintf(cmd, "./iqueue -s -f %s", test_iqx) > 0); + TS_TEST_ASSERT(exec(cmd, "12345")); + + // mod table in place + TS_TEST_ASSERT(sprintf(cmd, "./iqmod_inplace -s 0:987654321:1500000000000000000 %s", test_iqx) > 0); + TS_TEST_ASSERT(exec(cmd, NULL)); + TS_TEST_ASSERT(sprintf(cmd, "./iqueue -s -f %s", test_iqx) > 0); + TS_TEST_ASSERT(exec(cmd, "3ade68b1: 1500000000000000000")); + TS_TEST_ASSERT(sprintf(cmd, "./iqmod_inplace -s 0:987654321:1500000000000000001 %s", test_iqx) > 0); + TS_TEST_ASSERT(exec(cmd, NULL)); + TS_TEST_ASSERT(sprintf(cmd, "./iqueue -s -f %s", test_iqx) > 0); + 
TS_TEST_ASSERT(exec(cmd, "3ade68b1: 1500000000000000001")); + + // mod table copy + char *mod_iqx = NULL; + TS_TEST_ASSERT(asprintf(&mod_iqx, "%s/mod.iqx", dir) != -1); + + TS_TEST_ASSERT(sprintf(cmd, "./iqmod_copy -s 0:987654322:1500000000000000000 %s %s", test_iqx, mod_iqx) > 0); + TS_TEST_ASSERT(exec(cmd, NULL)); + TS_TEST_ASSERT(sprintf(cmd, "./iqueue -s -f %s", mod_iqx) > 0); + TS_TEST_ASSERT(exec(cmd, "3ade68b2: 1500000000000000000")); + TS_TEST_ASSERT(sprintf(cmd, "rm -rf %s", mod_iqx) > 0); + TS_TEST_ASSERT(exec(cmd, NULL)); + + TS_TEST_ASSERT(sprintf(cmd, "./iqmod_copy -x 0:987654322 %s %s", test_iqx, mod_iqx) > 0); + TS_TEST_ASSERT(exec(cmd, NULL)); + TS_TEST_ASSERT(sprintf(cmd, "./iqueue -s -f %s", mod_iqx) > 0); + TS_TEST_ASSERT(exec(cmd, "3ade68b1: 1500000000000000001")); + + return true; +} diff --git a/test/iqsync-bidirectional.test b/test/iqsync-bidirectional.test new file mode 100755 index 0000000..7c01fc1 --- /dev/null +++ b/test/iqsync-bidirectional.test @@ -0,0 +1,160 @@ +#! /bin/bash +# Copyright 2021 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Test multiple bi-directional iqsyncs to the same queues. 
+ +# Global settings +DIR=$(dirname $0) +PROG=$(basename $0) +PKGDIR=$(cd $(dirname ${0})/../..; pwd -L) + +# Populate the TS_*_HOME variables +eval $($(${PKGDIR}/bin/gettools)/bin/makepath -s -r ${PKGDIR}) + +WORKDIR="$(mktemp -d -t $PROG.XXXXXXXXXX)" +DCATSIMPLE=${TS_DCAT_NATIVE_HOME}/dcatsimple +trap "rm -fr $WORKDIR" 0 1 2 3 15 + +echo Using $WORKDIR +IQ1=$WORKDIR/iq1.iqx +IQ2=$WORKDIR/iq2.iqx + +die() { + echo >&2 "$@" + exit 1 +} + +warn() { + echo >&2 "$@" +} + +log() { + echo >&2 "--------- $@ -----------" +} + +export IQSYNC_CMD=${PKGDIR}/bin/iqsync +export IQSYNC_RSH=${PKGDIR}/sh/nop-ssh + +echo HEADER | ${PKGDIR}/bin/iqueue \ + -f $IQ1 \ + --create \ + --header \ +|| die "$IQ1: Unable to create" + +echo HEADER | ${PKGDIR}/bin/iqueue \ + -f $IQ2 \ + --create \ + --header \ +|| die "$IQ2: Unable to create" + +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + --line \ + < iq2" +$PKGDIR/bin/iqsync \ + -f $IQ2 \ + --push \ + --pull \ + --verbose \ + --report-interval 1 \ + --sleep 100 \ + --tail \ + localhost:$IQ1 \ + > $WORKDIR/iqsync.1.2.log \ + 2>&1 \ + & + +log "iq2 -> iq1" +$PKGDIR/bin/iqsync \ + -f $IQ1 \ + --push \ + --pull \ + --verbose \ + --report-interval 1 \ + --sleep 100 \ + --tail \ + localhost:$IQ2 \ + > $WORKDIR/iqsync.2.1.log \ + 2>&1 \ + & + +sleep 2 + +log "Updating writer status" +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + --writer 3,1 \ + +$PKGDIR/bin/iqueue \ + -f $IQ2 \ + --writer 4,1 \ + +sleep 2 +log "Adding items to iq1" +for i in `seq 1 5`; do + $PKGDIR/bin/iqueue \ + -f $IQ1 \ + --writer 5,$i \ + + seq 1 200000 | $PKGDIR/bin/iqueue \ + -f $IQ1 \ + --line \ + + sleep 2 +done + +log "Adding items to iq2" +for i in `seq 1 5`; do + $PKGDIR/bin/iqueue \ + -f $IQ2 \ + --writer 6,$i \ + + seq 1 200000 | $PKGDIR/bin/iqueue \ + -f $IQ2 \ + --line \ + + sleep 2 +done + +log "Waiting" +sleep 20 + +log "Checking iqueues" +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + > $WORKDIR/iq1.txt \ + +$PKGDIR/bin/iqueue \ + -f $IQ2 \ + > $WORKDIR/iq2.txt \ + +diff $WORKDIR/iq{1,2}.txt || 
die "sync failed" + +for iq in $IQ1 $IQ2; do + $PKGDIR/bin/iqueue \ + -f $iq \ + --stats \ + +done + +pkill -P $$ diff --git a/test/iqsync-buffer.test b/test/iqsync-buffer.test new file mode 100755 index 0000000..195add3 --- /dev/null +++ b/test/iqsync-buffer.test @@ -0,0 +1,204 @@ +#! /bin/bash +# Copyright 2021 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# +# Test iqsync using tcp +# + +# Global settings +DIR=$(dirname $0) +PROG=$(basename $0) +PKGDIR=$(cd $(dirname ${0})/../..; pwd -L) + +# Populate the TS_*_HOME variables +eval $($(${PKGDIR}/bin/gettools)/bin/makepath -s -r ${PKGDIR}) + +WORKDIR="$(mktemp -d -t $PROG.XXXXXXXXXX)" +DCATSIMPLE=${TS_DCAT_NATIVE_HOME}/bin/dcatsimple +trap "rm -fr $WORKDIR" 0 1 2 3 15 + +echo Using $WORKDIR +IQ1=$WORKDIR/iq1.iqx +IQ2=$WORKDIR/iq2.iqx +IQ3=$WORKDIR/iq3.iqx +IQ4=$WORKDIR/iq4.iqx + +die() { + echo >&2 "$@" + exit 1 +} + +warn() { + echo >&2 "$@" +} + +log() { + echo >&2 "--------- $@ -----------" +} + +export IQSYNC_CMD=${PKGDIR}/bin/iqsync +export IQSYNC_RSH=${PKGDIR}/sh/nop-ssh + +echo Header1 | ${PKGDIR}/bin/iqueue \ + -f $IQ1 \ + --create \ + --header \ +|| die "$IQ1: Unable to create" + +for n in {1..100000}; do echo "iqueue message $n";done | $PKGDIR/bin/iqueue \ + -f $IQ1 \ + --line + +################# +# +# Test clone into an new iqueue with recv buffer. +# Should succeed. 
+# +log "iq1 clone into iq2 with recv buffer" + +$PKGDIR/bin/iqsync \ + -f $IQ2 \ + --clone \ + --verbose \ + --sleep 100 \ + --type tcp \ + --launch-server \ + --recv-buffer \ + localhost:$IQ1 \ + >> $WORKDIR/iqsync.1.1.log \ + 2>&1 \ +|| die "iqsync --clone -f iq2 iq1 (new)" + +log "Checking iqueues" +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + > $WORKDIR/iq1.txt \ + +$PKGDIR/bin/iqueue \ + -f $IQ2 \ + > $WORKDIR/iq2.txt \ + +diff $WORKDIR/iq{1,2}.txt || die "sync failed" + +################ +# +# Test push as well. +# Should succeed. +# +log "iq2 push into iq1" +$PKGDIR/bin/iqsync \ + -f $IQ2 \ + --push \ + --validate \ + --verbose \ + --sleep 100 \ + --type tcp \ + --launch-server \ + --recv-buffer \ + localhost:$IQ1 \ + >> $WORKDIR/iqsync.1.2.log \ + 2>&1 \ +|| die "iqsync --push --validate -f iq2 iq1 (existing)" + + +################# +# +# Test clone into an new iqueue with send buffer. +# Should succeed. +# +log "iq1 clone into iq3 with send buffer" + +$PKGDIR/bin/iqsync \ + -f $IQ3 \ + --clone \ + --verbose \ + --sleep 100 \ + --type tcp \ + --launch-server \ + --send-buffer \ + localhost:$IQ1 \ + >> $WORKDIR/iqsync.2.1.log \ + 2>&1 \ +|| die "iqsync --clone -f iq3 iq1 (new)" + +log "Checking iqueues" +$PKGDIR/bin/iqueue \ + -f $IQ3 \ + > $WORKDIR/iq3.txt \ + +diff $WORKDIR/iq{1,3}.txt || die "sync failed" + +################# +# +# Test clone into an new iqueue with send buffer. +# Should succeed. +# +log "iq1 clone into iq3 with send buffer" + +$PKGDIR/bin/iqsync \ + -f $IQ3 \ + --clone \ + --verbose \ + --sleep 100 \ + --type tcp \ + --launch-server \ + --send-buffer \ + localhost:$IQ1 \ + >> $WORKDIR/iqsync.2.log \ + 2>&1 \ +|| die "iqsync --clone -f iq3 iq1 (new)" + +log "Checking iqueues" +$PKGDIR/bin/iqueue \ + -f $IQ3 \ + > $WORKDIR/iq3.txt \ + +diff $WORKDIR/iq{1,3}.txt || die "sync failed" + +################ +# +# Test push as well. +# Should succeed. 
+# +log "iq3 push into iq1" +$PKGDIR/bin/iqsync \ + -f $IQ3 \ + --push \ + --validate \ + --verbose \ + --sleep 100 \ + --type tcp \ + --launch-server \ + --recv-buffer \ + localhost:$IQ1 \ + >> $WORKDIR/iqsync.2.2.log \ + 2>&1 \ +|| die "iqsync --push --validate -f iq3 iq1 (existing)" + + +log "Waiting" +sleep 10 + +for iq in $IQ1 $IQ2 $IQ3; do + $PKGDIR/bin/iqueue \ + -f $iq \ + --stats \ + +done + + +pkill -P $$ +exit 0 diff --git a/test/iqsync-cascade.test b/test/iqsync-cascade.test new file mode 100755 index 0000000..ee64990 --- /dev/null +++ b/test/iqsync-cascade.test @@ -0,0 +1,179 @@ +#! /bin/bash +# Copyright 2021 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +# Test iqsync with lots of different hop configurations, +# including a direct cycle + +# Global settings +DIR=$(dirname $0) +PROG=$(basename $0) +PKGDIR=$(cd $(dirname ${0})/../..; pwd -L) + +# Populate the TS_*_HOME variables +eval $($(${PKGDIR}/bin/gettools)/bin/makepath -s -r ${PKGDIR}) + +WORKDIR="$(mktemp -d -t $PROG.XXXXXXXXXX)" +DCATSIMPLE=${TS_DCAT_NATIVE_HOME}/bin/dcatsimple +trap "rm -fr $WORKDIR" 0 1 2 3 15 + +echo Using $WORKDIR +IQ1=$WORKDIR/iq1.iqx +IQ2=$WORKDIR/iq2.iqx +IQ3=$WORKDIR/iq3.iqx +IQ4=$WORKDIR/iq4.iqx + +die() { + echo >&2 "$@" + exit 1 +} + +warn() { + echo >&2 "$@" +} + +log() { + echo >&2 "--------- $@ -----------" +} + +export IQSYNC_CMD=${PKGDIR}/bin/iqsync +export IQSYNC_RSH=${PKGDIR}/sh/nop-ssh + +echo HEADER | ${PKGDIR}/bin/iqueue \ + -f $IQ1 \ + --create \ + --header \ +|| die "$IQ1: Unable to create" + +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + --line \ + < iq2" +$PKGDIR/bin/iqsync \ + -f $IQ2 \ + --clone \ + --verbose \ + --report-interval 1 \ + --sleep 100 \ + --tail \ + localhost:$IQ1 \ + > $WORKDIR/iqsync.1.2.log \ + 2>&1 \ + & + +log "iq1 -> iq3" +$PKGDIR/bin/iqsync \ + -f $IQ3 \ + --clone \ + --verbose \ + --report-interval 1 \ + --sleep 100 \ + --tail \ + localhost:$IQ1 \ + > $WORKDIR/iqsync.1.3.log \ + 2>&1 \ + & + +sleep 2 + +log "iq2 -> iq4" +$PKGDIR/bin/iqsync \ + -f $IQ4 \ + --clone \ + --verbose \ + --report-interval 1 \ + --sleep 100 \ + --tail \ + localhost:$IQ2 \ + > $WORKDIR/iqsync.2.4.log \ + 2>&1 \ + & + +log "iq3 -> iq4" +$PKGDIR/bin/iqsync \ + -f $IQ4 \ + --clone \ + --verbose \ + --report-interval 1 \ + --sleep 100 \ + --tail \ + localhost:$IQ3 \ + > $WORKDIR/iqsync.3.4.log \ + 2>&1 \ + & + +log "iq4 -> iq1" +$PKGDIR/bin/iqsync \ + -f $IQ1 \ + --clone \ + --verbose \ + --report-interval 1 \ + --sleep 100 \ + --tail \ + localhost:$IQ4 \ + > $WORKDIR/iqsync.4.1.log \ + 2>&1 \ + & + +sleep 2 + +log "Updating writer status" +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + --writer 3,1 \ + +sleep 2 +log "Adding items" +for i 
in `seq 1 10`; do + $PKGDIR/bin/iqueue \ + -f $IQ1 \ + --writer 4,$i \ + + seq 1 200000 | $PKGDIR/bin/iqueue \ + -f $IQ1 \ + --line \ + + sleep 2 +done + + +log "Waiting" +sleep 20 +log "Checking iqueues" +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + > $WORKDIR/iq1.txt \ + +$PKGDIR/bin/iqueue \ + -f $IQ4 \ + > $WORKDIR/iq4.txt \ + +diff $WORKDIR/iq{1,4}.txt || die "sync failed" + +for iq in $IQ1 $IQ2 $IQ3 $IQ4; do + $PKGDIR/bin/iqueue \ + -f $iq \ + --stats \ + +done + +pkill -P $$ diff --git a/test/iqsync-filter.test b/test/iqsync-filter.test new file mode 100755 index 0000000..853618a --- /dev/null +++ b/test/iqsync-filter.test @@ -0,0 +1,218 @@ +#! /bin/bash +# Copyright 2021 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +# +# Test iqsync filters +# + +# Global settings +DIR=$(dirname $0) +PROG=$(basename $0) +PKGDIR=$(cd $(dirname ${0})/../..; pwd -L) + +# Populate the TS_*_HOME variables +eval $($(${PKGDIR}/bin/gettools)/bin/buildpath -s -r ${PKGDIR}) + +WORKDIR="$(mktemp -d -t $PROG.XXXXXXXXXX)" +trap "rm -fr $WORKDIR" 0 1 2 3 15 + +IQSYNC=${PKGDIR}/bin/iqsync +IQUEUE=${PKGDIR}/bin/iqueue + +die() { + echo >&2 "$@" + exit 1 +} + +warn() { + echo >&2 "$@" +} + +log() { + echo >&2 "--------- $@ -----------" +} + +export IQSYNC_RSH=${PKGDIR}/sh/nop-ssh + +echo Using $WORKDIR +SRC_IQ=$WORKDIR/src.iqx +IQ1=$WORKDIR/iq1.iqx +IQ2=$WORKDIR/iq2.iqx + +echo Header1 | $IQUEUE \ + -f $SRC_IQ \ + --create \ + --header \ +|| die "$SRC_IQ: Unable to create" + +LEGACY_FILTER_SRC=${WORKDIR}/filter_legacy.c +LEGACY_FILTER_LIB=${WORKDIR}/libfilter_legacy.so +FILTER_SRC=$WORKDIR/filter.c +FILTER_LIB=${WORKDIR}/libfilter.so + +for n in {1..100}; do echo "$n";done | $IQUEUE \ + -f $SRC_IQ \ + --line + +$IQUEUE \ + -f $SRC_IQ \ + > $WORKDIR/src.txt +################ +# IQSync using legacy filter +# +log "creating legacy filter shared library" +cat >$LEGACY_FILTER_SRC < +#include +#include + +typedef int (*filter_fn_t)(void *handle, const void *buf, size_t len); + +static int +filter(void *handle, const void *buf, size_t len) +{ + int *index = handle; + + if ((*index)++ % 2 == 0) { + return 1; + } else { + return 0; + } +} + +int +setup( + const void *iq_header, + size_t iq_hdr_len, + void **filter_priv_data, + filter_fn_t *filter_fn) +{ + int *index = calloc(1, sizeof(*index)); + + *filter_priv_data = index; + *filter_fn = filter; + + return 0; +} +EOF + +gcc -o ${LEGACY_FILTER_LIB} -shared -fpic ${LEGACY_FILTER_SRC} + +export IQSYNC_CMD="${IQSYNC} --filter ${LEGACY_FILTER_LIB}:setup" + +log "iqsync src into iq1" +$IQSYNC \ + -f $IQ1 \ + --clone \ + --verbose \ + --sleep 100 \ + localhost:$SRC_IQ \ + >> $WORKDIR/iqsync.1.log \ + 2>&1 \ +|| die "iqsync --clone -f src iq1" + +$IQUEUE \ + -f $IQ1 \ + 
> $WORKDIR/iq1.txt + +################ +# IQSync using filter +# +log "creating filter shared library" +cat >$FILTER_SRC < +#include +#include +#include "tsproperties.h" + +typedef int (*filter_fn_t)(void *handle, const void *buf, size_t len); + +typedef struct _filter_data +{ + int msg_index; + + int id; + int total_len; +} filter_data_t; + +static int +filter(void *handle, const void *buf, size_t len) +{ + filter_data_t *data = handle; + + if ((data->msg_index)++ % data->total_len == data->id) { + return 1; + } else { + return 0; + } +} + +int +setup( + const void *iq_header, + size_t iq_hdr_len, + void **filter_priv_data, + filter_fn_t *filter_fn, + tsproperties_t *props) +{ + filter_data_t *data = calloc(1, sizeof(*data)); + + data->id = atoi(tsproperties_get(props, "index")); + data->total_len = atoi(tsproperties_get(props, "total_len")); + + *filter_priv_data = data; + *filter_fn = filter; + + return 0; +} +EOF + +gcc -I${TS_LANG_C_BASE_HOME}/c/include -I${TS_LANG_C_UTIL_HOME}/c/include -o ${FILTER_LIB} -shared -fpic ${FILTER_SRC} + +export IQSYNC_CMD="${IQSYNC} --filter ${FILTER_LIB}:setup?index=0&total_len=2" + +log "iqsync src into iq2" +$IQSYNC \ + -f $IQ2 \ + --clone \ + --verbose \ + --sleep 100 \ + localhost:$SRC_IQ \ + >> $WORKDIR/iqsync.2.log \ + 2>&1 \ +|| die "iqsync --clone -f src iq2" + +$IQUEUE \ + -f $IQ2 \ + > $WORKDIR/iq2.txt + +diff $WORKDIR/iq{1,2}.txt || die "filter dismatch" +[ $(cat ${WORKDIR}/iq1.txt|wc -l) -eq 51 ] || die "filter failed" +################ +# clean up +# +log "Waiting" +sleep 1 + +for iq in $SRC_IQ $IQ1 $IQ2; do + $PKGDIR/bin/iqueue \ + -f $iq \ + --stats \ + +done + +pkill -P $$ +exit 0 diff --git a/test/iqsync-latency-test.c b/test/iqsync-latency-test.c new file mode 100644 index 0000000..35a4674 --- /dev/null +++ b/test/iqsync-latency-test.c @@ -0,0 +1,124 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except 
in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "twosigma.h" +#include +#include "tslog.h" +#include "iqueue.h" +#include "tsclock.h" +#include "segfault.h" + + +static inline uint64_t +write_timestamp( + iqueue_t * const iq +) +{ + iqueue_msg_t iqmsg; + uint64_t * const msg = iqueue_allocate_raw(iq, 2 * sizeof(*msg), &iqmsg); + if (!msg) + TSABORTX("allocate failed"); + uint64_t ts = msg[0] = tsclock_getnanos(0); + if (iqueue_update_be(iq, iqmsg, &msg[1]) != 0) + TSABORTX("update failed"); + + return ts; +} + + +static inline uint64_t +read_timestamp( + iqueue_t * const iq, + uint64_t iq_index +) +{ + size_t len; + + while (1) + { + int status = iqueue_status_wait(iq, iq_index, 1); + if (status == IQUEUE_STATUS_HAS_DATA) + return *(const uint64_t*) iqueue_data(iq, iq_index, &len); + } +} + + +int +main( + int argc, + char ** argv +) +{ + __USE(argc); + segfault_handler_install(); + + const char * const iq1_name = "/tmp/iq1.iqx"; + const char * const iq2_name = "/tmp/iq2.iqx"; + + iqueue_t * const iq1 = iqueue_create(iq1_name, 0, NULL, 0); + if (!iq1) + TSABORT("iq1"); + + iqueue_t * const iq2 = iqueue_create(iq2_name, 0, NULL, 0); + if (!iq2) + TSABORT("iq2"); + + uint64_t read_ts, write_ts, delta1 = 0, delta2 = 0; + + printf("Run: %s --tail --push --pull -f %s localhost:%s\n", + argv[0], + iq1_name, + iq2_name + ); + + write_ts = write_timestamp(iq1); + read_ts = read_timestamp(iq2, 0); + + TSLOGXL(TSINFO, "Received ack: %"PRIu64, tsclock_getnanos(0) - read_ts); + + const uint64_t max_iter = 1 << 20; + + for (uint64_t i = 1 ; i < max_iter ; i += 
2) + { + // Check one-way (from 1 into 2) + write_ts = write_timestamp(iq1); + read_ts = read_timestamp(iq2, i); + + if (write_ts != read_ts) + TSABORTX("write %"PRIu64" != read %"PRIu64, read_ts, write_ts); + delta1 += tsclock_getnanos(0) - write_ts; + + // Check ping pong (from 2 back into 1) + write_timestamp(iq2); + read_timestamp(iq1, i+1); + + delta2 += tsclock_getnanos(0) - write_ts; + + if ((i & 0xFFFF) != 0xFFFF) + continue; + + printf("one way: total %"PRIu64" avg %"PRIu64" ns\n", + delta1, + delta1 / i / 2 + ); + + printf("two way: total %"PRIu64" avg %"PRIu64" ns\n", + delta2, + delta2 / i / 2 + ); + } + + return 0; +} diff --git a/test/iqsync-multi.test b/test/iqsync-multi.test new file mode 100755 index 0000000..2636e40 --- /dev/null +++ b/test/iqsync-multi.test @@ -0,0 +1,143 @@ +#! /bin/bash +# Copyright 2021 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Test multiple iqsync's writing into the same iqueue, reading from +# the same iqueue. 
+ +# Global settings +DIR=$(dirname $0) +PROG=$(basename $0) +PKGDIR=$(cd $(dirname ${0})/../..; pwd -L) + +# Populate the TS_*_HOME variables +eval $($(${PKGDIR}/bin/gettools)/bin/makepath -s -r ${PKGDIR}) + +WORKDIR="$(mktemp -d -t $PROG.XXXXXXXXXX)" +DCATSIMPLE=${TS_DCAT_NATIVE_HOME}/bin/dcatsimple +trap "rm -fr $WORKDIR" 0 1 2 3 15 + +echo Using $WORKDIR +IQ1=$WORKDIR/iq1.iqx +IQ2=$WORKDIR/iq2.iqx + +die() { + echo >&2 "$@" + exit 1 +} + +warn() { + echo >&2 "$@" +} + +log() { + echo >&2 "--------- $@ -----------" +} + +export IQSYNC_CMD=${PKGDIR}/bin/iqsync +export IQSYNC_RSH=${PKGDIR}/sh/nop-ssh + +echo HEADER | ${PKGDIR}/bin/iqueue \ + -f $IQ1 \ + --create \ + --header \ +|| die "$IQ1: Unable to create" + +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + --line \ + < $WORKDIR/iqsync.$i.log \ + 2>&1 \ + & +done + +sleep 2 +$PKGDIR/bin/iqueue -f $IQ2 --watch & +watchpid=$! + +log "Updating writer status" +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + --writer 3,1 \ + +sleep 2 +log "Adding items" +for i in `seq 1 10`; do + $PKGDIR/bin/iqueue \ + -f $IQ1 \ + --writer 4,$i \ + + seq 1 1000000 | $PKGDIR/bin/iqueue \ + -f $IQ1 \ + --line \ + + sleep 2 +done + +cat > /dev/null \ + < $iq.txt \ + + $PKGDIR/bin/iqueue \ + -f $iq \ + --stats \ + +done + +for iq in $IQ1 $IQ2; do + $PKGDIR/bin/iqueue \ + -f $iq \ + --debug -1 \ + > $iq.debug \ + +done + +diff $WORKDIR/iq{1,2}.iqx.txt || die "second sync failed" diff --git a/test/iqsync-pingpong-test.c b/test/iqsync-pingpong-test.c new file mode 100644 index 0000000..5c5b1fb --- /dev/null +++ b/test/iqsync-pingpong-test.c @@ -0,0 +1,185 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** \file + * Test ping-pong latency between two iqueues synchronized with iqsync + * in bi-directional mode. + * + * On machine1, run: + * iqsync-pingpong-test ping + * + * On machine2, run: + * iqsync-pingpong-test pong + * + * Then on machine1, run: + * iqsync --sleep 0 --push --pull -f /tmp/ping.iqx machine2:/tmp/pong.iqx + * + */ +#include "twosigma.h" +#include +#include +#include "tslog.h" +#include "iqueue.h" +#include "tsclock.h" +#include "segfault.h" + + +static inline uint64_t +write_timestamp( + iqueue_t * const iq, + uint64_t reply +) +{ + iqueue_msg_t iqmsg; + uint64_t * const msg = iqueue_allocate_raw(iq, 3 * sizeof(*msg), &iqmsg); + if (!msg) + TSABORTX("allocate failed"); + uint64_t ts = msg[0] = tsclock_getnanos(0); + msg[2] = reply; + + if (iqueue_update_be(iq, iqmsg, &msg[1]) != 0) + TSABORTX("update failed"); + + return ts; +} + + +static inline const uint64_t * +read_timestamp( + iqueue_t * const iq, + uint64_t iq_index +) +{ + size_t len; + + while (1) + { + const uint64_t * const msg = iqueue_data(iq, iq_index, &len); + if (msg) + return msg; + } +} + + +int +main( + int argc, + char ** argv +) +{ + segfault_handler_install(); + + if (argc <= 1) + TSABORTX("Usage: %s (ping|pong) [/path/to/iqx]", argv[0]); + + const char * name; + int do_ping; + if (strcmp(argv[1], "ping") == 0) + { + name = "PING"; + do_ping = 1; + } else + if (strcmp(argv[1], "pong") == 0) + { + name = "PONG"; + do_ping = 0; + } else + TSABORTX("Must be ping or pong"); + + const char * iq_name; + if (argc > 2) + iq_name = argv[2]; + else + if 
(do_ping) + iq_name = "/tmp/ping.iqx"; + else + iq_name = "/tmp/pong.iqx"; + + unlink(iq_name); + iqueue_t * const iq = iqueue_create(iq_name, tsclock_getnanos(0), NULL, 0); + if (!iq) + TSABORT("%s: failed", name); + + uint64_t write_ts, read_ts; + const uint64_t * reply; + + if (do_ping) + { + // Send the first message and wait for the response + write_ts = write_timestamp(iq, 0); + reply = read_timestamp(iq, 1); + read_ts = reply[2]; + + if (write_ts != read_ts) + TSABORTX("Invalid reply: %"PRIu64" != %"PRIu64, write_ts, read_ts); + + TSLOGXL(TSINFO, "%s: Received ack: %"PRIu64, + name, + tsclock_getnanos(0) - write_ts + ); + } else { + // Read the first message and send a response + reply = read_timestamp(iq, 0); + TSLOGXL(TSINFO, "%s: saw first message", name); + + write_timestamp(iq, reply[0]); + } + + + const uint64_t max_iter = 1 << 20; + + uint64_t sum = 0; + uint64_t sum2 = 0; + uint64_t count = 0; + + for (uint64_t i = 2 ; i < max_iter ; i += 2) + { + if (!do_ping) + { + // Pong just waits for a message and then sends reply + reply = read_timestamp(iq, i); + write_timestamp(iq, reply[0]); + continue; + } + + // Check round trip (from us to them and back) + write_ts = write_timestamp(iq, 0); + reply = read_timestamp(iq, i + 1); + read_ts = reply[2]; + + if (write_ts != read_ts) + { + TSHDUMPL(TSINFO, reply, 32); + TSABORTX("write %"PRIu64" != read %"PRIu64, read_ts, write_ts); + } + uint64_t delta = tsclock_getnanos(0) - write_ts; + sum += delta; + sum2 += delta * delta; + + if ((count++ & 0xFFFF) != 0xFFFF) + continue; + + uint64_t avg = sum / count; + + printf("round trip: total %"PRIu64" avg %"PRIu64" ns\n", + sum, + avg + ); + + count = sum = sum2 = 0; + } + + return 0; +} diff --git a/test/iqsync-pushpull.test b/test/iqsync-pushpull.test new file mode 100755 index 0000000..ce5fbad --- /dev/null +++ b/test/iqsync-pushpull.test @@ -0,0 +1,172 @@ +#! 
/bin/bash +# Copyright 2021 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Global settings +DIR=$(dirname $0) +PROG=$(basename $0) +PKGDIR=$(cd $(dirname ${0})/../..; pwd -L) + +# Populate the TS_*_HOME variables +eval $($(${PKGDIR}/bin/gettools)/bin/makepath -s -r ${PKGDIR}) + +WORKDIR="$(mktemp -d -t $PROG.XXXXXXXXXX)" +DCATSIMPLE=${TS_DCAT_NATIVE_HOME}/bin/dcatsimple +trap "rm -fr $WORKDIR" 0 1 2 3 15 + +echo Using $WORKDIR +IQ1=$WORKDIR/iq1.iqx +IQ2=$WORKDIR/iq2.iqx + +die() { + echo >&2 "$@" + exit 1 +} + +warn() { + echo >&2 "$@" +} + +export IQSYNC_CMD=${PKGDIR}/bin/iqsync +export IQSYNC_RSH=${PKGDIR}/sh/nop-ssh + +iqueue-append() { + $PKGDIR/bin/iqueue \ + --append \ + -f $1 \ + || die "$1: Unable to append" +} + + +${PKGDIR}/bin/iqueue \ + --create \ + -f $IQ1 \ +|| die "$IQ1: Unable to create" + +${PKGDIR}/bin/iqueue \ + --create \ + -f $IQ2 \ +|| die "$IQ2: Unable to create" + +echo "iq1 first entry" | iqueue-append $IQ1 +echo "iq1 second entry" | iqueue-append $IQ1 +echo "iq2 first entry" | iqueue-append $IQ2 +echo "iq2 second entry" | iqueue-append $IQ2 + + +# +# Test the non-tail case in bidirectional, which should just exchange the +# two items +# +${PKGDIR}/bin/iqsync \ + -f $IQ2 \ + --sleep 100 \ + --report-interval 5 \ + --pull \ + --push \ + --verbose \ + localhost:$IQ1 \ + + +${PKGDIR}/bin/iqueue -f $IQ1 -b > $WORKDIR/iq1.txt +cat < $WORKDIR/iq2.txt +cat </dev/null; then + die "iqsync did not exit on its own" +fi + 
+warn "Everything exited correctly. Checking results" + +${PKGDIR}/bin/iqueue -f $IQ2 -b > $WORKDIR/iq2.txt +cat <&2 "$@" + exit 1 +} + +warn() { + echo >&2 "$@" +} + +log() { + echo >&2 "--------- $@ -----------" +} + +export IQSYNC_CMD=${PKGDIR}/bin/iqsync +export IQSYNC_RSH=${PKGDIR}/sh/nop-ssh + +iqueue-append() { + $PKGDIR/bin/iqueue \ + --append \ + -f $1 \ + || die "$1: Unable to append" +} + + +echo HEADER | ${PKGDIR}/bin/iqueue \ + -f $IQ1 \ + --create \ + --header \ +|| die "$IQ1: Unable to create" + +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + --line \ + < $WORKDIR/iq1.txt \ + +# Run the first iqsync; should copy everything +log "sync iq2 from iq1" +$PKGDIR/bin/iqsync \ + -f $IQ2 \ + --clone \ + --verbose \ + localhost:$IQ1 \ + +$PKGDIR/bin/iqueue \ + -f $IQ2 \ + > $WORKDIR/iq2.txt \ + +diff $WORKDIR/iq{1,2}.txt || die "first sync failed" + +# Run the iqsync again; should be a nop +log "sync iq2 from iq1 (nop)" +$PKGDIR/bin/iqsync \ + -f $IQ2 \ + --clone \ + --verbose \ + localhost:$IQ1 \ + +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + > $WORKDIR/iq1.txt \ + +$PKGDIR/bin/iqueue \ + -f $IQ2 \ + > $WORKDIR/iq2.txt \ + +diff $WORKDIR/iq{1,2}.txt || die "second sync failed" + +# Add a few items to the first queue and update the heartbeat +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + --line \ + < $WORKDIR/iq1.txt \ + +$PKGDIR/bin/iqueue \ + -f $IQ2 \ + > $WORKDIR/iq2.txt \ + +diff $WORKDIR/iq{1,2}.txt || die "third sync failed" + +# Generate iq3, with new data and sync it into iq1 and iq2 +log "create iq3" +echo HEADER | ${PKGDIR}/bin/iqueue \ + -f $IQ3 \ + --create \ + --header \ +|| die "$IQ3: Unable to create" + +$PKGDIR/bin/iqueue \ + -f $IQ3 \ + --line \ + < $WORKDIR/iq1.txt \ + +$PKGDIR/bin/iqueue \ + -f $IQ2 \ + > $WORKDIR/iq2.txt \ + +diff $WORKDIR/iq{1,2}.txt || die "fourth sync failed" diff --git a/test/iqsync-tcp.test b/test/iqsync-tcp.test new file mode 100755 index 0000000..02d56f9 --- /dev/null +++ b/test/iqsync-tcp.test @@ -0,0 +1,234 @@ +#! 
/bin/bash +# Copyright 2021 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# +# Test iqsync using tcp +# + +# Global settings +DIR=$(dirname $0) +PROG=$(basename $0) +PKGDIR=$(cd $(dirname ${0})/../..; pwd -L) + +# Populate the TS_*_HOME variables +eval $($(${PKGDIR}/bin/gettools)/bin/makepath -s -r ${PKGDIR}) + +WORKDIR="$(mktemp -d -t $PROG.XXXXXXXXXX)" +DCATSIMPLE=${TS_DCAT_NATIVE_HOME}/bin/dcatsimple +trap "rm -fr $WORKDIR" 0 1 2 3 15 + +echo Using $WORKDIR +IQ1=$WORKDIR/iq1.iqx +IQ2=$WORKDIR/iq2.iqx +IQ3=$WORKDIR/iq3.iqx +IQ4=$WORKDIR/iq4.iqx + +die() { + echo >&2 "$@" + exit 1 +} + +warn() { + echo >&2 "$@" +} + +log() { + echo >&2 "--------- $@ -----------" +} + +export IQSYNC_CMD=${PKGDIR}/bin/iqsync +export IQSYNC_RSH=${PKGDIR}/sh/nop-ssh + +echo Header1 | ${PKGDIR}/bin/iqueue \ + -f $IQ1 \ + --create \ + --header \ +|| die "$IQ1: Unable to create" + +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + --line \ + <> $WORKDIR/iqsync.1.2.log \ + 2>&1 \ +|| die "iqsync --clone -f iq2 iq1 (new)" + +log "Checking iqueues (first sync)" +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + > $WORKDIR/iq1.txt \ + +$PKGDIR/bin/iqueue \ + -f $IQ2 \ + > $WORKDIR/iq2.txt \ + +diff $WORKDIR/iq{1,2}.txt || die "sync failed" + + +################# +# +# Test clone into an existing iqueue with the same header. +# Should succeed. 
+# +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + --line \ + <> $WORKDIR/iqsync.1.2.log \ + 2>&1 \ +|| die "iqsync --clone -f iq2 iq1 (existing)" + +log "Checking iqueues" +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + > $WORKDIR/iq1.txt \ + +$PKGDIR/bin/iqueue \ + -f $IQ2 \ + > $WORKDIR/iq2.txt \ + +diff $WORKDIR/iq{1,2}.txt || die "sync failed" + + +################ +# +# Test push as well. +# Should succeed. +# +log "iq2 push into iq1" +$PKGDIR/bin/iqsync \ + -f $IQ2 \ + --push \ + --validate \ + --verbose \ + --sleep 100 \ + --type tcp \ + --launch-server \ + localhost:$IQ1 \ + >> $WORKDIR/iqsync.2.1.log \ + 2>&1 \ +|| die "iqsync --push --validate -f iq2 iq1 (existing)" + + + +################### +# +# Create a new iqueue with invalid header +# + +log "iq1 clone into existing iq3 with bad header" +echo BadHeader | ${PKGDIR}/bin/iqueue \ + -f $IQ3 \ + --create \ + --header \ +|| die "$IQ3: Unable to create" + +$PKGDIR/bin/iqueue \ + -f $IQ3 \ + --line \ + <> $WORKDIR/iqsync.1.3.log \ + 2>&1 \ +&& die "$IQ3: Should have aborted due to bad header" + +################ +# +# Test push as well. +# Should fail. +# +log "iq2 push into iq3" +$PKGDIR/bin/iqsync \ + -f $IQ2 \ + --push \ + --validate \ + --verbose \ + --sleep 100 \ + --type tcp \ + --launch-server \ + localhost:$IQ3 \ + >> $WORKDIR/iqsync.2.3.log \ + 2>&1 \ +&& die "iqsync --push --validate -f iq2 iq3 (existing) should have failed" + + + +$PKGDIR/bin/iqueue \ + -f $IQ3 \ + > $WORKDIR/iq3.txt \ + +log "Waiting" +sleep 10 + +for iq in $IQ1 $IQ2 $IQ3; do + $PKGDIR/bin/iqueue \ + -f $iq \ + --stats \ + +done + +pkill -P $$ +exit 0 diff --git a/test/iqsync-verify.test b/test/iqsync-verify.test new file mode 100755 index 0000000..37e6923 --- /dev/null +++ b/test/iqsync-verify.test @@ -0,0 +1,224 @@ +#! /bin/bash +# Copyright 2021 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Test header verification of iqsync + +# Global settings +DIR=$(dirname $0) +PROG=$(basename $0) +PKGDIR=$(cd $(dirname ${0})/../..; pwd -L) + +# Populate the TS_*_HOME variables +eval $($(${PKGDIR}/bin/gettools)/bin/makepath -s -r ${PKGDIR}) + +WORKDIR="$(mktemp -d -t $PROG.XXXXXXXXXX)" +DCATSIMPLE=${TS_DCAT_NATIVE_HOME}/bin/dcatsimple + +#set -x +trap "rm -fr $WORKDIR" 0 1 2 3 15 + +die() { + echo >&2 "$@" + exit 1 +} + +warn() { + echo >&2 "$@" +} + +log() { + echo >&2 "--------- $@ -----------" +} + +warn Using $WORKDIR +IQ1=$WORKDIR/iq1.iqx +IQ2=$WORKDIR/iq2.iqx +IQ3=$WORKDIR/iq3.iqx + +export IQSYNC_CMD=${PKGDIR}/bin/iqsync +export IQSYNC_RSH=${PKGDIR}/sh/nop-ssh + + +echo Header1 | ${PKGDIR}/bin/iqueue \ + -f $IQ1 \ + --create \ + --header \ +|| die "$IQ1: Unable to create" + +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + --line \ + <> $WORKDIR/iqsync.1.2.log \ + 2>&1 \ +|| die "iqsync --clone -f iq2 iq1 (new)" + +log "Checking iqueues (first sync)" +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + > $WORKDIR/iq1.txt \ + +$PKGDIR/bin/iqueue \ + -f $IQ2 \ + > $WORKDIR/iq2.txt \ + +diff $WORKDIR/iq{1,2}.txt || die "sync failed" + + +################# +# +# Test clone into an existing iqueue with the same header. +# Should succeed. 
+# +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + --line \ + <> $WORKDIR/iqsync.1.2.log \ + 2>&1 \ +|| die "iqsync --clone -f iq2 iq1 (existing)" + +log "Checking iqueues" +$PKGDIR/bin/iqueue \ + -f $IQ1 \ + > $WORKDIR/iq1.txt \ + +$PKGDIR/bin/iqueue \ + -f $IQ2 \ + > $WORKDIR/iq2.txt \ + +diff $WORKDIR/iq{1,2}.txt || die "sync failed" + + +################ +# +# Test push as well. +# Should succeed. +# +log "iq2 push into iq1" +$PKGDIR/bin/iqsync \ + -f $IQ2 \ + --push \ + --validate \ + --verbose \ + --sleep 100 \ + localhost:$IQ1 \ + >> $WORKDIR/iqsync.2.1.log \ + 2>&1 \ +|| die "iqsync --push --validate -f iq2 iq1 (existing)" + + + +################### +# +# Create a new iqueue with invalid header +# + +log "iq1 clone into existing iq3 with bad header" +echo BadHeader | ${PKGDIR}/bin/iqueue \ + -f $IQ3 \ + --create \ + --header \ +|| die "$IQ3: Unable to create" + +$PKGDIR/bin/iqueue \ + -f $IQ3 \ + --line \ + <> $WORKDIR/iqsync.1.3.log \ + 2>&1 \ +&& die "$IQ3: Should have aborted due to bad header" + +################ +# +# Test push as well. +# Should fail. +# +log "iq2 push into iq3" +$PKGDIR/bin/iqsync \ + -f $IQ2 \ + --push \ + --validate \ + --verbose \ + --sleep 100 \ + localhost:$IQ3 \ + >> $WORKDIR/iqsync.2.3.log \ + 2>&1 \ +&& die "iqsync --push --validate -f iq2 iq3 (existing) should have failed" + + + +$PKGDIR/bin/iqueue \ + -f $IQ3 \ + > $WORKDIR/iq3.txt \ + +log "Waiting" +sleep 10 + +for iq in $IQ1 $IQ2 $IQ3; do + $PKGDIR/bin/iqueue \ + -f $iq \ + --stats \ + +done + +pkill -P $$ +exit 0 diff --git a/test/iqueue-big-test.c b/test/iqueue-big-test.c new file mode 100644 index 0000000..201699b --- /dev/null +++ b/test/iqueue-big-test.c @@ -0,0 +1,420 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** \file + * Create a big iqueue, way bigger than available memory to test page + * cache reuse. + */ +#include "twosigma.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "tslog.h" +#include "tsclock.h" +#include "iqueue.h" +#include "segfault.h" +#include "atomic.h" + + + +static int ftrace_enabled_fd = -1; +static int ftrace_marker_fd = -1; + +static void +ftrace_open(void) +{ + const char * const enabled_file = "/sys/kernel/debug/tracing/tracing_enabled"; + const char * const marker_file = "/sys/kernel/debug/tracing/trace_marker"; + ftrace_enabled_fd = open(enabled_file, O_WRONLY); + if (ftrace_enabled_fd < 0) + warn("%s: Unable to open (ignored)", enabled_file); + + ftrace_marker_fd = open(marker_file, O_WRONLY); + if (ftrace_marker_fd < 0) + warn("%s: Unable to open (ignored)", marker_file); +} + + +static void +ftrace_stop(void) +{ + if (ftrace_enabled_fd < 0) + return; + + ssize_t wlen = write(ftrace_enabled_fd, "0\n", 2); + if (wlen == 2) + return; + + perror("ftrace_enabled"); + close(ftrace_enabled_fd); + ftrace_enabled_fd = -1; +} + + +static void +__attribute__((__format__(__printf__, 1, 2))) +ftrace_mark( + const char * fmt, + ... 
+) +{ + if (ftrace_marker_fd < 0) + return; + + char buf[1024]; + va_list ap; + va_start(ap, fmt); + int len = vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + + size_t wlen = write(ftrace_marker_fd, buf, len); + if (wlen > 0) + return; + + perror("ftrace_marker"); + close(ftrace_marker_fd); + ftrace_marker_fd = -1; +} + + +static sem_t prefetch_sem; +static uint64_t last_map_offset; + +static void * +prefetch_thread( + void * const iq_ptr +) +{ + iqueue_t * const iq = iq_ptr; + const uint64_t map_chunk = 1 << 29; + + while (1) + { + int rc = sem_wait(&prefetch_sem); + if (rc < 0) + { + if (errno == EINTR || errno == EAGAIN || errno == ETIMEDOUT) + continue; + break; + } + + // Map 512 MB ahead of the current location + const uint64_t mask = (1 << 29) - 1; + uint64_t map_offset = (last_map_offset + mask + 1) & ~mask; + + //TSLOGXL(TSINFO, "%s: prefetching %"PRIx64, iqueue_name(iq), map_offset); + uint64_t prefetch_time = -tsclock_getnanos(0); + + uint64_t * data = (void*)(uintptr_t) iqueue_get_data(iq, map_offset, 1); + if (!data) + break; + + for (uint64_t offset = 0 ; offset < map_chunk / 8 ; offset += 4096 / 8) + atomic_cas_64(data + offset, 0, 0); + + prefetch_time += tsclock_getnanos(0); + TSLOGXL(TSINFO, "%s: prefetched %"PRIx64" - %"PRIx64": %"PRIu64" ns", + iqueue_name(iq), map_offset, map_offset + map_chunk, prefetch_time); + } + + TSLOGXL(TSERROR, "%s: prefetch exiting at offset %"PRIu64, + iqueue_name(iq), + last_map_offset + ); + + return NULL; +} + + +int +main( + int argc, + char ** argv +) +{ + segfault_handler_install(); + ftrace_open(); + + const uint64_t report_interval = 1000e6; // ns + const uint64_t target_rate = 750000; // ops/sec + const uint64_t ns_per_op = 1e9 / target_rate; + const uint64_t spike = 1e6; // 1 ms == big spike + + const size_t msg_len = 200; + uint64_t data_len = 100ul << 30; + const uint64_t msg_count = data_len / msg_len; + + const char * const iqx_file = argc > 1 ? 
argv[1] : "/blink/test/huge-test.iqx"; + unlink(iqx_file); + iqueue_t * const iq = iqueue_create(iqx_file, 0, NULL, 0); + + if (!iq) + TSABORTX("%s: Unable to create", iqx_file); + + TSLOGXL(TSINFO, "%s: Finished map", iqx_file); + + iqueue_allocator_t allocator; + if (iqueue_allocator_init(iq, &allocator, 1 << 20, 1) < 0) + TSABORTX("%s: Unable to create allocator", iqx_file); + + allocator.align_mask = 32 - 1; + +if (0) { + pthread_t thread; + sem_init(&prefetch_sem, 0, 1); + last_map_offset = -1; // force immediate load of first region + if (1) pthread_create(&thread, NULL, prefetch_thread, iq); + sleep(1); +} + + pthread_t thread; + if (1) iqueue_prefetch_thread(iq, &thread); + + iqueue_id_t id = -1; + + uint64_t sum_count = 0; + uint64_t allocate_sum = 0; + uint64_t memcpy_sum = 0; + uint64_t update_sum = 0; + uint64_t allocate_max = 0; + uint64_t memcpy_max = 0; + uint64_t update_max = 0; + uint64_t allocate_spike = 0; + uint64_t memcpy_spike = 0; + uint64_t update_spike = 0; + uint64_t start = tsclock_getnanos(0); + uint64_t last_report = start; + uint64_t last_op = 0; + + while (id+1 < msg_count) + { + // Wait for the next time slice; + uint64_t allocate_start; + do { + allocate_start = tsclock_getnanos(0); + } while (allocate_start < last_op + ns_per_op); + last_op = allocate_start; + + iqueue_msg_t iqmsg; + uint64_t * const msg = iqueue_allocate(&allocator, msg_len, &iqmsg); + if (!msg) + break; + + const uint64_t offset = iqueue_msg_offset(iqmsg); + + if ((offset >> 29) != (last_map_offset >> 29)) + { + last_map_offset = offset; + sem_post(&prefetch_sem); + } + + uintptr_t msg_base = (uintptr_t) msg; + if (msg_base & 0x1F) + TSABORTX("%s: message is not aligned: %p", iqx_file, msg); + + const uint64_t memcpy_start = tsclock_getnanos(0); + + memset(msg, (++id) & 0xFF, msg_len); + msg[1] = memcpy_start; + + const uint64_t update_start = tsclock_getnanos(0); + + if (0) ftrace_mark("iqueue update %"PRIx64, offset); + + int rc = iqueue_update_be(iq, iqmsg, 
(iqueue_id_t*) &msg[0]); + if (rc != 0) + { + TSLOGXL(TSINFO, "%s: %p update failed: rc=%d", iqx_file, msg, rc); + break; + } + + const uint64_t validate_start = tsclock_getnanos(0); + + const uint64_t msg_id = be64toh(msg[0]); + if (msg_id != id) + TSABORTX("%s: %"PRIu64" != expected index %"PRIu64, iqx_file, msg_id, id); + + const uint64_t allocate_time = memcpy_start - allocate_start; + const uint64_t memcpy_time = update_start - memcpy_start; + const uint64_t update_time = validate_start - update_start; + allocate_sum += allocate_time; + memcpy_sum += memcpy_time; + update_sum += update_time; + sum_count++; + + if (allocate_time > allocate_max) + allocate_max = allocate_time; + if (allocate_time > spike) + { + ftrace_mark("allocate spike %"PRIu64" iqueue offset %"PRIx64"\n", + allocate_time, + offset + ); + ftrace_stop(); + allocate_spike++; + } + + if (memcpy_time > memcpy_max) + memcpy_max = memcpy_time; + if (memcpy_time > spike) + { + ftrace_mark("memcpy spike %"PRIu64" iqueue offset %"PRIx64"\n", memcpy_time, offset); + ftrace_stop(); + memcpy_spike++; + } + + if (update_time > update_max) + update_max = update_time; + if (update_time > spike) + { + ftrace_mark("update spike %"PRIu64" iqueue offset %"PRIx64"\n", update_time, offset); + ftrace_stop(); + update_spike++; + } + + const uint64_t report_delta = validate_start - last_report; + if (report_delta < report_interval) + continue; + + ftrace_mark("iqueue offset %"PRIx64"\n", offset); + + TSLOGXL(TSINFO, "%"PRIx64" %.3f%%: %.3f kops/sec %.3f MB/s:" + " allocate %"PRIu64" (max %"PRIu64", %"PRIu64" spikes)" + " memcpy %"PRIu64" (max %"PRIu64", %"PRIu64" spikes)" + " update %"PRIu64" (max %"PRIu64", %"PRIu64" spikes)" + "%s", + offset, + (id * 100.0) / msg_count, + (sum_count * 1.0e9 / 1.0e3) / report_delta, + (sum_count * msg_len * 1.0e9 / 1.0e6) / report_delta, + allocate_sum / sum_count, + allocate_max, + allocate_spike, + memcpy_sum / sum_count, + memcpy_max, + memcpy_spike, + update_sum / sum_count, + 
update_max, + update_spike, + allocate_spike || memcpy_spike || update_spike ? " !!!!" : "" + ); + + last_report = validate_start; + allocate_sum = memcpy_sum = update_sum = 0; + allocate_max = memcpy_max = update_max = 0; + allocate_spike = memcpy_spike = update_spike = 0; + sum_count = 0; + } + + last_report = tsclock_getnanos(0); + allocate_sum = memcpy_sum = update_sum = 0; + allocate_max = memcpy_max = update_max = 0; + allocate_spike = memcpy_spike = update_spike = 0; + sum_count = 0; + +#if 0 + TSLOGXL(TSINFO, + "%s: Added %"PRIu64" entries, expected %"PRIu64", %.2f%% waste", + iqx_file, + id, + data_len / msg_len, + 100.0 - (id * msg_len * 100.0) / data_len + ); +#endif + + // Read all of the elements and verify that they have no overlap + id = -1; + while (1) + { + const uint64_t read_start = tsclock_getnanos(0); + + size_t len; + const uint64_t * const msg = iqueue_data(iq, ++id, &len); + if (!msg) + break; + + const uint64_t validate_start = tsclock_getnanos(0); + + if (len != msg_len) + TSABORTX("%"PRIu64": bad length %zu?", id, len); + + const uint64_t msg_id = be64toh(msg[0]); + if (msg_id != id) + TSABORTX("%"PRIu64": bad id %"PRIu64"?", id, msg_id); + + // Verify that all of the message is correct + const uint8_t * const buf = (const uint8_t*) msg; + for (size_t i = 16 ; i < msg_len ; i++) + if (buf[i] != (id & 0xFF)) + TSABORTX("%"PRIu64": offset %zu bad value", id, i); + + const uint64_t validate_end = tsclock_getnanos(0); + + const uint64_t read_time = validate_start - read_start; + const uint64_t validate_time = validate_end - validate_start; + allocate_sum += read_time; + memcpy_sum += validate_time; + sum_count++; + + if (read_time > allocate_max) + allocate_max = read_time; + if (read_time > spike) + allocate_spike++; + if (validate_time > memcpy_max) + memcpy_max = validate_time; + if (validate_time > spike) + memcpy_spike++; + + const uint64_t report_delta = validate_end - last_report; + if (report_delta < report_interval) + continue; + + 
TSLOGXL(TSINFO, "%.3f%%: %.3f kops/sec %.3f MB/s:" + " read %"PRIu64" (max %"PRIu64", %"PRIu64" spikes)" + " memcmp %"PRIu64" (max %"PRIu64", %"PRIu64" spikes)" + "", + (id * 100.0) / msg_count, + (sum_count * 1.0e9 / 1.0e3) / report_delta, + (sum_count * msg_len * 1.0e9 / 1.0e6) / report_delta, + allocate_sum / sum_count, + allocate_max, + allocate_spike, + memcpy_sum / sum_count, + memcpy_max, + memcpy_spike + ); + + last_report = validate_start; + allocate_sum = memcpy_sum = update_sum = 0; + allocate_max = memcpy_max = update_max = 0; + allocate_spike = memcpy_spike = update_spike = 0; + sum_count = 0; + } + + TSLOGXL(TSINFO, "%s: all messages checked out ok", iqx_file); + + iqueue_close(iq); + + return 0; +} diff --git a/test/iqueue-contention-test.c b/test/iqueue-contention-test.c new file mode 100644 index 0000000..d4ac59c --- /dev/null +++ b/test/iqueue-contention-test.c @@ -0,0 +1,194 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** \file + * Flood an iqueue with multiple writers to measure contended write time. + * + * Reports stats on stdout for redirection into a log file. 
+ */ +#include "twosigma.h" +#include +#include +#include +#include +#include +#include "tslog.h" +#include "tsclock.h" +#include "tssched.h" +#include "iqueue.h" + +static volatile int go; +static iqueue_t * iq; +static const size_t msg_len = 200; +static const uint64_t write_iters = 1 << 16; + +struct thread_context +{ + int nanosleep_time; + int id; + const char * filename; + iqueue_t * iq; + pthread_t thread; + int cpu; + uint64_t total_time; +}; + + +static void * +write_thread( + void * priv +) +{ + struct thread_context * const context = priv; + const uint64_t my_id = context->cpu; + + while (true) { + context->iq = iqueue_create(context->filename, 0, NULL, 0); + if (context->iq) + break; + } + iq = context->iq; + + iqueue_allocator_t allocator; + if (iqueue_allocator_init(context->iq, &allocator, 1<<16, 1) < 0) + TSABORTX("Unable to create iqueue allocator"); + + while (!go) + ; + + uint64_t last_delta = 0; + uint64_t msg[msg_len / sizeof(uint64_t)]; + context->total_time = 0; + + for (uint64_t iter = 0 ; iter < write_iters ; iter++) + { + int64_t start = tsclock_getnanos(0); + iqueue_msg_t offset; + uint64_t * const data = iqueue_allocate(&allocator, msg_len, &offset); + + if (!data) + TSABORTX("%"PRIu64": Unable to allocate message %"PRIu64, my_id, iter); + + memcpy(data, msg, msg_len); + data[0] = my_id; + data[1] = last_delta; + + if (iqueue_update(allocator.iq, offset, NULL) != 0) + TSABORTX("%"PRIu64": Unable to write message %"PRIu64, my_id, iter); + + int64_t stop = tsclock_getnanos(0); + last_delta = stop - start; + context->total_time += last_delta; + + while (tsclock_getnanos(0) < stop + context->nanosleep_time) + ; + } + + return 0; +} + + +static void +write_test( + const char * label, + int thread_count, + int * thread_cpus +) +{ + const char * filename = "/dev/shm/test3.iqx"; + unlink(filename); + + struct thread_context contexts[thread_count]; + + for (int i = 0 ; i < thread_count ; i++) + { + struct thread_context * const context = 
&contexts[i]; + context->iq = NULL; + context->filename = filename; + context->cpu = thread_cpus[i]; + context->nanosleep_time = 0; + + pthread_create(&context->thread, NULL, write_thread, context); + tssched_set_thread_affinity(context->thread, thread_cpus[i]); + } + + go = 1; + + for (int i = 0 ; i < thread_count ; i++) + { + struct thread_context * const context = &contexts[i]; + pthread_join(context->thread, NULL); + fprintf(stderr, "%s %d %"PRIu64" ns/write\n", + label, + context->cpu, + context->total_time / write_iters + ); + } + + uint64_t read_time = -tsclock_getnanos(0); + for (uint64_t iter = 0 ; iter < write_iters * thread_count ; iter ++) + { + size_t len; + const uint64_t * const msg = iqueue_data(iq, iter, &len); + if (!msg) + TSABORTX("Unable to read message %"PRIu64, iter); + } + read_time += tsclock_getnanos(0); + fprintf(stderr, "%s %"PRIu64" ns/read\n", + label, + read_time / (write_iters * thread_count) + ); + + for (uint64_t iter = 0 ; iter < write_iters * thread_count ; iter++) + { + size_t len; + const uint64_t * const msg = iqueue_data(iq, iter, &len); + printf("%s %"PRIu64" %"PRId64"\n", label, msg[0], msg[1]); + } + + iqueue_close(iq); + + unlink(filename); +} + + +int main(void) +{ + write_test("hyper", 2, + (int[]) { 1, 13 }); + + write_test("same-2", 2, + (int[]) { 1, 3 }); + + write_test("same-4", 4, + (int[]) { 1, 3, 5, 7 }); + + write_test("same-8", 8, + (int[]) { 1, 3, 5, 7, 9, 11, 13, 15 }); + + write_test("cross-2", 2, + (int[]) { 1, 2 }); + + write_test("cross-4", 4, + (int[]) { 1, 2, 3, 4 }); + + write_test("cross-8", 8, + (int[]) { 1, 2, 3, 4, 5, 6, 7, 8 }); + + write_test("cross-16", 16, + (int[]) { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }); + + return 0; +} diff --git a/test/iqueue-dc-test.c b/test/iqueue-dc-test.c new file mode 100644 index 0000000..174b464 --- /dev/null +++ b/test/iqueue-dc-test.c @@ -0,0 +1,154 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, 
Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** \file + * Test the DC wrapper for iqueue processing. + * + * If no DC file is given on the command line, a fake one will be created + * and an iqueue will be read from it. + */ +#include "twosigma.h" +#include +#include +#include +#include "tslog.h" +#include "tsclock.h" +#include "iqueue-dc.h" +#include "segfault.h" + + +static const uint64_t row_count = 1 << 20; + + +static void +write_dc_file( + const char * dc_file +) +{ + static const tsutil_dcat_type_t types[] = { + { "c1", TSUTIL_DCAT_SHORT_TYPE, -1 }, + { "v1", TSUTIL_DCAT_BUF_TYPE, -1 }, + { "c2", TSUTIL_DCAT_LONG_TYPE, -1 }, + { "v2", TSUTIL_DCAT_BUF_TYPE, -1 }, + { 0, 0, 0 }, + }; + + tsutil_dcat_file_t * file = tsutil_dcat_write_file( + dc_file, + "", + types, + 0 + ); + if (!file) + TSABORTX("%s: Unable to write?", dc_file); + + // Write an all invalid row + tsutil_dcat_write_newrow( + file, + 0xF, + tsclock_getnanos(0) + ); + + for (uint64_t i = 1 ; i < row_count ; i++) + { + tsutil_dcat_write_newrow( + file, + i & 1, + tsclock_getnanos(0) + ); + + // Alternate the first column being valid + if ((i & 1) == 0) + tsutil_dcat_write_short(file, (uint16_t) i); + tsutil_dcat_write_buf(file, "v1 message goes here", i & 15); + tsutil_dcat_write_long(file, (uint64_t) i); + tsutil_dcat_write_buf(file, "v2 messge goes here", (~i) & 15); + } + + tsutil_dcat_close(file); +} + + +int main( + int argc, + const char ** argv +) +{ + segfault_handler_install(); + tslevel = TSDEBUG; + + const char 
* dc_file = "/tmp/test.dc"; + + // Create a fake file if one is not specified + if (argc <= 1) + write_dc_file(dc_file); + else + dc_file = argv[1]; + + const char * user_hdr; + const tsutil_dcat_type_t * types; + iqueue_t * const iq = iqueue_open_dc( + dc_file, + &user_hdr, + &types + ); + + if (!iq) + TSABORTX("%s: Unable to map", dc_file); + + const uint64_t entries = iqueue_entries(iq); + + if (argc <= 1 && entries != row_count) + TSABORTX("%s: Should have %"PRIu64" entries, not %"PRIu64, + dc_file, + row_count, + entries + ); + + TSLOGXL(TSINFO, "%s: Mapped %"PRIu64" entries", dc_file, entries); + + uint64_t id = 0; + uint64_t total_len = 0; + + uint64_t read_time = -tsclock_getnanos(0); + while (1) + { + size_t len; + const void * data = iqueue_data(iq, id, &len); + if (!data) + break; + + const uint64_t timestamp = be64toh(*(const uint64_t*) data); + TSLOGXL(TSDIAG, "%"PRIu64": %"PRIu64" %zu", id, timestamp, len); + + id++; + total_len += len; + __USE(timestamp); + } + read_time += tsclock_getnanos(0); + + TSLOGXL(TSINFO, "%s: Average row size: %"PRIu64" bytes, %"PRIu64" ns/row", + dc_file, + id ? total_len / id : 0, + id ? read_time / id : 0 + ); + + iqueue_close(iq); + + if (argc <= 1) + unlink(dc_file); + + return 0; +} diff --git a/test/iqueue-latency-test.c b/test/iqueue-latency-test.c new file mode 100644 index 0000000..dbbd1fa --- /dev/null +++ b/test/iqueue-latency-test.c @@ -0,0 +1,206 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "twosigma.h" +#include +#include +#include +#include +#include +#include "tslog.h" +#include "tsclock.h" +#include "tssched.h" +#include "iqueue.h" +#include "segfault.h" + + +static volatile int go; +static uint64_t msg_len = 200; +static const uint64_t latency_iters = 1 << 20; + +static void * +latency_pong( + void * priv +) +{ + iqueue_t * const iq = priv; + iqueue_allocator_t allocator; + if (iqueue_allocator_init(iq, &allocator, 1 << 20, 1) < 0) + TSABORTX("Failed to create allocator"); + + while (!go) + ; + + for (uint64_t iter = 0 ; iter < latency_iters ; iter += 2) + { + if (iqueue_status_wait(iq, iter, -1) != IQUEUE_STATUS_HAS_DATA) + TSABORTX("Failed to get message %"PRIu64, iter); + + size_t len; + const uint64_t * const msg = iqueue_data(iq, iter, &len); + + char data[len]; + memcpy(data, msg, len); + const uint64_t now = tsclock_getnanos(0); + + iqueue_msg_t offset; + uint64_t * const reply = iqueue_allocate(&allocator, len, &offset); + if (!reply) + TSABORTX("Failed to allocate reply %"PRIu64, iter); + + TSLOGXL(TSDIAG, "%"PRIu64": %p => %p @ %"PRIx64, + iter, + msg, + reply, + iqueue_msg_offset(offset) + ); + + memcpy(reply, msg, len); + reply[0] = now - msg[0]; + + if (iqueue_update(allocator.iq, offset, NULL) != 0) + TSABORTX("Failed to update reply %"PRIu64, iter); + } + + return NULL; +} + + +static void * +latency_ping( + void * priv +) +{ + iqueue_t * const iq = priv; + iqueue_allocator_t allocator; + if (iqueue_allocator_init(iq, &allocator, 1 << 20, 1) < 0) + TSABORTX("Failed to create allocator"); + + while (!go) + ; + + uint64_t data[msg_len / sizeof(uint64_t)]; + memset(data, 0xA5, sizeof(data)); + + uint64_t total_time = 0; + const uint64_t warmup_iters = 1001; + + for (uint64_t iter = 1 ; iter < latency_iters ; iter += 2) + { + uint64_t now = tsclock_getnanos(0); + if (iter == warmup_iters) + total_time = -now; + + 
size_t len = sizeof(data); + iqueue_msg_t offset; + uint64_t * const msg = iqueue_allocate(&allocator, len, &offset); + if (!msg) + TSABORTX("Failed to allocate msg %"PRIu64, iter); + TSLOGXL(TSDIAG, "%"PRIu64": %p @ %"PRIx64, iter-1, msg, iqueue_msg_offset(offset)); + + memcpy(msg, data, len); + msg[0] = now; + + uint64_t id; + if (iqueue_update(allocator.iq, offset, &id) != 0) + TSABORTX("Failed to update msg %"PRIu64, iter); + if (id != iter - 1) + TSABORTX("Iter %"PRIu64" posted to slot %"PRIu64"!", iter-1, id); + + if (iqueue_status_wait(iq, iter, -1) != IQUEUE_STATUS_HAS_DATA) + TSABORTX("Failed to get message %"PRIu64, iter); + const uint64_t * const reply = iqueue_data(iq, iter, &len); + + if (len != sizeof(data)) + TSABORTX("Got bad size! id %"PRIu64" expected %zu got %zu", iter, sizeof(data), len); + if (memcmp(data+1, reply+1, len - sizeof(*data)) != 0) + TSABORTX("Reply mismatch!"); + } + + total_time += tsclock_getnanos(0); + TSLOGXL(TSINFO, + "%"PRIu64" bytes average one-way latency: %"PRIu64" ns", + sizeof(data), + total_time / (latency_iters - warmup_iters) + ); + + return NULL; +} + + +static void +latency_test( + const char * label, + int reader_cpu, + int writer_cpu +) +{ + const char * file = "/dev/shm/test3.iqx"; + unlink(file); + + iqueue_t * const iq = iqueue_create(file, 0, NULL, 0); + if (!iq) + TSABORT("Unable to create iqueue!"); + + pthread_t threads[2]; + + // One-way latency test + pthread_create(&threads[0], NULL, latency_ping, iq); + pthread_create(&threads[1], NULL, latency_pong, iq); + tssched_set_thread_affinity(threads[0], reader_cpu); + tssched_set_thread_affinity(threads[1], writer_cpu); + + TSLOGXL(TSINFO, "Reader cpu %d writer cpu %d", reader_cpu, writer_cpu); + go = 1; + pthread_join(threads[0], NULL); + pthread_join(threads[1], NULL); + +#if 0 + for (uint64_t iter = 8193 ; iter < latency_iters ; iter += 2) + { + size_t len; + const uint64_t * const msg = iqueue_data(iq, iter, &len); + printf("%s %"PRIu64" %"PRIu64"\n", 
label, msg_len, *msg); + } +#else + // Compute a histagram of the data set + for (uint64_t iter = 8193 ; iter < latency_iters ; iter += 2) + { + size_t len; + const uint64_t * const msg = iqueue_data(iq, iter, &len); + printf("%s %"PRIu64" %"PRIu64"\n", label, msg_len, *msg); + } +#endif + + iqueue_close(iq); +} + + +int main(void) +{ + segfault_handler_install(); + + uint64_t msg_lens[] = { 8, 64, 128, 200, 512, 1024 }; + + for (unsigned i = 0 ; i < __arraycount(msg_lens) ; i++) + { + msg_len = msg_lens[i]; + latency_test("hyper", 1, 13); + latency_test("same", 1, 3); + latency_test("cross", 1, 2); + } + + return 0; +} diff --git a/test/iqueue-overhead-test.c b/test/iqueue-overhead-test.c new file mode 100644 index 0000000..7db4408 --- /dev/null +++ b/test/iqueue-overhead-test.c @@ -0,0 +1,165 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** \file + * Report the overhead of writing to an uncontested iqueue with + * and without locking enabled. 
+ */ +#include "twosigma.h" +#include +#include +#include +#include +#include +#include "tslog.h" +#include "tsclock.h" +#include "tssched.h" +#include "iqueue.h" +#include "segfault.h" + + +static void +histogram( + const char * const label, + const uint64_t * const samples, + const uint64_t count +) +{ + uint64_t sum = 0; + uint64_t sum2 = 0; + uint64_t min = -1; + uint64_t max = 0; + + for (uint64_t i = 0 ; i < count ; i++) + { + const uint64_t x = samples[i]; + sum += x; + sum2 += x * x; + if (x > max) + max = x; + if (x < min) + min = x; + } + + const uint64_t avg = sum / count; + + printf("%s: min=%"PRIu64" avg=%"PRIu64" max=%"PRIu64" var=%"PRIu64"\n", + label, + min, + avg, + max, + sum2 / count - avg * avg + ); +} + + +static void +overhead_test( + const char * const label, + const char * const iqfile, + const uint64_t msg_count, + const uint64_t msg_len, + const int lock_type +) +{ + unlink(iqfile); + iqueue_t * const iq = iqueue_create(iqfile, 0, NULL, 0); + if (!iq) + TSABORTX("%s: Unable to create", iqfile); + + if ((lock_type & 1) && iqueue_mlock(iq) == -1) + { + TSLOGXL(TSERROR, "%s: Unable to lock", iqfile); + return; + } else + if ((lock_type & 2) && iqueue_prefetch_thread(iq, NULL) == -1) + TSABORTX("%s: Unable to start prefetch", iqfile); + + iqueue_allocator_t allocator; + if (iqueue_allocator_init(iq, &allocator, 1 << 16, 1) < 0) + TSABORTX("%s: Unable to create allocator", iqfile); + + // Limit the write speed to 512 MB/s + const uint64_t delay = 1.0e9 * msg_len / (512 << 20); + + uint64_t * const update_times = calloc(msg_count, sizeof(*update_times)); + + for (uint64_t i = 0 ; i < msg_count ; i++) + { + const uint64_t start = tsclock_getnanos(0); + iqueue_msg_t iqmsg; + uint8_t * const msg = iqueue_allocate(&allocator, msg_len, &iqmsg); + if (!msg) + TSABORTX("%s: Unable to allocate %"PRIu64, iqfile, i); + + memset(msg, i & 0xFF, msg_len); + + if (iqueue_update(iq, iqmsg, NULL) != 0) + TSABORTX("%s: Unable to update %"PRIu64, iqfile, i); + + 
const uint64_t end = tsclock_getnanos(0); + + update_times[i] = end - start; + + // delay so that we limit our maximum write speed to 200 MB/s + while ((uint64_t) tsclock_getnanos(0) < start + delay) + continue; + } + + iqueue_close(iq); + +#if 0 + for (uint64_t i = 0 ; i < msg_count ; i++) + printf("%"PRIu64",%s,%"PRIu64",%"PRIu64",%"PRIu64",%"PRIu64"\n", + i, + label, + msg_len, + alloc_times[i], + memset_times[i], + update_times[i] + ); +#else + histogram(label, update_times, msg_count); +#endif + + free(update_times); +} + + +int +main( + int argc, + char ** argv +) +{ + segfault_handler_install(); + + const char * const iqfile = argc > 1 ? argv[1] : "/tmp/overhead.iqx"; + const uint64_t msg_len = 200; + //const uint64_t msg_count = (2048ull << 20) / msg_len; + const uint64_t msg_count = (512ull << 20) / msg_len; + + printf("iter,test,len,alloc,memset,update\n"); + + overhead_test("mlock", iqfile, msg_count, msg_len, 1); + overhead_test("mlock+prefetch", iqfile, msg_count, msg_len, 3); + overhead_test("normal", iqfile, msg_count, msg_len, 0); + overhead_test("prefetch", iqfile, msg_count, msg_len, 2); + //overhead_test("prefetch", iqfile, msg_count, msg_len, 1); + + //unlink(iqfile); + + return 0; +} diff --git a/test/iqueue-read-contention-test.c b/test/iqueue-read-contention-test.c new file mode 100644 index 0000000..a2bf840 --- /dev/null +++ b/test/iqueue-read-contention-test.c @@ -0,0 +1,230 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Multiple readers on an iqueue with a single writer, testing + * contended read time. + */ +#include "twosigma.h" +#include +#include +#include +#include +#include +#include "tslog.h" +#include "tsclock.h" +#include "tssched.h" +#include "iqueue.h" + +static volatile int go; +static const size_t msg_len = 200; +static const uint64_t write_iters = 1 << 16; + +struct thread_context +{ + // Shared. + iqueue_t *iq; + pthread_t thread; + const char *filename; + int cpu; + uint64_t total_time; + + // Used by writer only. + int nanosleep_time; + + // Used by reader only. + bool ready; +}; + + +static void * +write_thread( + void *priv) +{ + struct thread_context *const context = priv; + const uint64_t my_id = context->cpu; + + while (true) { + context->iq = iqueue_create(context->filename, 0, NULL, 0); + if (!context->iq) { + TSLOGX(TSERROR, "Unable to create iqueue!"); + } else { + break; + } + } + + iqueue_allocator_t allocator; + if (iqueue_allocator_init(context->iq, &allocator, 1<<16, 1) < 0) + TSABORTX("Unable to create iqueue allocator"); + + // Signal reads to open iqueue. + go = 1; + + // Wait for readers to be done. + while (go != 2); + + // Start writing. 
+ uint64_t last_delta = 0; + for (uint64_t iter = 0; iter < write_iters; ++iter) + { + int64_t start = tsclock_getnanos(0); + iqueue_msg_t offset; + uint64_t * const data = iqueue_allocate(&allocator, msg_len, &offset); + + if (!data) + break; + data[0] = my_id; + data[1] = last_delta; + data[2] = tsclock_getnanos(0); + + if (iqueue_update(allocator.iq, offset, NULL) != 0) + TSABORTX("%"PRIu64": Unable to write message %"PRIu64, my_id, iter); + + int64_t stop = tsclock_getnanos(0); + last_delta = stop - start; + context->total_time += last_delta; + + while (tsclock_getnanos(0) < stop + context->nanosleep_time); + } + + iqueue_close(context->iq); + + return 0; +} + +static void * +read_thread( + void *priv) +{ + struct thread_context * const context = priv; + + // Wait for writer to create iqueue. + while (go != 1); + + // Open iqueue for reading and signal that we're ready. + context->iq = iqueue_open(context->filename, false); + context->ready = true; + + // Wait for all readers to be ready. + while (go != 2); + + uint64_t readed = 0; // Because read is a keyword... 
+ while (readed < write_iters) { + size_t len; + const uint64_t *const msg = iqueue_data(context->iq, readed, &len); + if (msg) { + ++readed; + uint64_t delta = tsclock_getnanos(0) - msg[2]; + context->total_time += delta; + } + } + fprintf(stderr, "Average read for %d - %"PRIu64" ns\n", + context->cpu, + context->total_time / write_iters); + + iqueue_close(context->iq); + + return 0; +} + +static void +read_test( + const char *label, + int nanosleep_time, + int thread_count, + int *thread_cpus) +{ + const char * filename = "/dev/shm/test3.iqx"; + unlink(filename); + + struct thread_context contexts[thread_count]; + + for (int i = 0; i < thread_count; i++) { + struct thread_context * const context = &contexts[i]; + context->iq = NULL; + context->filename = filename; + context->cpu = thread_cpus[i]; + context->nanosleep_time = nanosleep_time; + context->total_time = 0; + context->ready = false; + + pthread_create(&context->thread, NULL, i == 0 ? write_thread : read_thread, context); + tssched_set_thread_affinity(context->thread, thread_cpus[i]); + } + + // Wait for all readers to be ready. + while (true) { + bool all_done = true; + for (int i = 1; i < thread_count; ++i) { + if (!contexts[i].ready) { + all_done = false; + break; + } + } + if (all_done) break; + } + + // Signal to start. + go = 2; + + // Reporting. + uint64_t total_time = 0; + for (int i = 0; i < thread_count; ++i) + { + struct thread_context * const context = &contexts[i]; + pthread_join(context->thread, NULL); + if (i == 0) { + fprintf(stderr, "%s %d %"PRIu64" ns/write\n", + label, + context->cpu, + context->total_time / write_iters + ); + } else { + total_time += context->total_time; + } + } + TSLOGX(TSINFO, "%s Average read: %.1lfns", + label, + (double)total_time / thread_count / write_iters); + + // Sometimes the file system is slow and not having this cause + // weird issues when the next writer starts up. 
+ sleep(1); +} + +int +main( + int argc, + char **argv) +{ + if (argc != 2) { + TSLOGX(TSERROR, "usage: iqueue-read-contention-test "); + return 1; + } + + int nanosleep_time = atoi(argv[1]); + + // The first CPU is the writer. + read_test("same-8", nanosleep_time, 9, + (int[]) { 1, 3, 5, 7, 9, 11, 13, 15, 17 }); + + read_test("cross-8", nanosleep_time, 9, + (int[]) { 1, 9, 10, 11, 12, 13, 14, 15, 16 }); + + read_test("cross-16", nanosleep_time, 17, + (int[]) { 1, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 21, 30, 31, 32, 33, 34 }); + + return 0; +} diff --git a/test/iqueue-seal-test b/test/iqueue-seal-test new file mode 100644 index 0000000..363dde3 --- /dev/null +++ b/test/iqueue-seal-test @@ -0,0 +1,112 @@ +#! /bin/bash +# Copyright 2021 Two Sigma Open Source, LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Global settings +DIR=$(dirname $0) +PROG=$(basename $0) +PKGDIR=$(cd $(dirname ${0})/../../..; pawd) + +# Populate the TS_*_HOME variables +eval $($(${PKGDIR}/bin/gettools)/bin/makepath -s -r ${PKGDIR}) + +PROJECT=transport +ARCH="$("${PRKDIR}/bin/machine" -l)" +WORKDIR="$(mktemp -d $DIR/iqueue-seal.XXXXXXXXXX)" +trap "rm -fr $WORKDIR" 0 1 2 3 15 + +echo Using $WORKDIR +IQ1=$WORKDIR/iq1.iqx + +die() { + echo >&2 "$@" + if [ ! 
-z "$APPEND_PID" ]; then + killall $APPEND_PID + fi + + rm -rf $WORKDIR + exit 1 +} + +warn() { + echo >&2 "$@" +} + +trap "die Caught signal" 1 2 3 15 +trap "rm -rf $WORKDIR" 0 + + +# This test will add data into an iqueue that will then be sealed +# to test that it can recover and keep writing + +${PKGDIR}/bin/iqueue \ + -f $IQ1 \ + --create \ +|| die "$IQ1: Unable to create" + +perl -e '$|=1; for my $i (0..12) { print "$i\n"; sleep 1 }' \ +| ${PKGDIR}/bin/iqueue \ + -f $IQ1 \ + --line \ + --follow & + +APPEND_PID=$! + +sleep 5 + +warn "Sealing $IQ1" +${PKGDIR}/bin/iqueue \ + -f $IQ1 \ + --seal \ +|| die "$IQ1: Unable to seal" + +# Verify that only the five samples are in place +${PKGDIR}/bin/iqueue -f $IQ1 > $WORKDIR/iq1.txt +cat < $WORKDIR/iq1.txt +cat <&2 "$@" + if [ ! -z "$APPEND_PID" ]; then + killall $APPEND_PID + fi + + rm -rf $WORKDIR + exit 1 +} + +warn() { + echo >&2 "$@" +} + +trap "die Caught signal" 1 2 3 15 +trap "rm -rf $WORKDIR" 0 + + +# This test will add data into an iqueue that will then be sealed +# to test that it can recover and keep writing + +${PKGDIR}/bin/iqueue \ + -f $IQ1 \ + --create \ +|| die "$IQ1: Unable to create" + +perl -e '$|=1; for my $i (0..12) { print "$i\n"; sleep 1 }' \ +| ${PKGDIR}/bin/iqueue \ + -f $IQ1 \ + --line \ + --follow & + +APPEND_PID=$! 
+ +sleep 5 + +warn "Sealing $IQ1" +${PKGDIR}/bin/iqueue \ + -f $IQ1 \ + --seal \ +|| die "$IQ1: Unable to seal" + +# Verify that only the five samples are in place +${PKGDIR}/bin/iqueue -f $IQ1 > $WORKDIR/iq1.txt +cat < $WORKDIR/iq1.txt +cat < +#include +#include +#include "tslog.h" +#include "iqueue.h" + +TS_ADD_TEST(refilling_allocator) +{ + char *dir = ts_test_resource_get_dir("refilling_allocator"); + char *iq_file = NULL; + if (asprintf(&iq_file, "%s/align-test.iqx", dir) == -1) + TSABORTX("failed to asprintf"); + + iqueue_t * const iq = iqueue_create(iq_file, 0, NULL, 0); + TS_TEST_ASSERT(iq); + + iqueue_allocator_t allocator; + TS_TEST_ASSERT(iqueue_allocator_init(iq, &allocator, 1 << 16, 1) == 0); + + allocator.align_mask = 32 - 1; + + // Populate the iqueue with aligned messages + const size_t alloc_len = 9403; + const size_t msg_len = 1003; + iqueue_id_t id = -1; + const uint64_t msg_count = 1 << 20; + + while (++id < msg_count) + { + iqueue_msg_t iqmsg; + uint64_t * const msg = iqueue_allocate(&allocator, alloc_len, &iqmsg); + TS_TEST_ASSERT(msg); + + uintptr_t msg_base = (uintptr_t) msg; + TS_TEST_ASSERT((msg_base & 0x1F) == 0); + + memset(msg, id & 0xFF, alloc_len); + + TS_TEST_ASSERT(iqueue_realloc(&allocator, &iqmsg, msg_len) >= 0); + TS_TEST_ASSERT(iqueue_update_be(iq, iqmsg, (iqueue_id_t*) &msg[0]) == 0); + + const uint64_t msg_id = be64toh(msg[0]); + TS_TEST_EQUALS(msg_id, id); + + TSLOGXL(TSDEBUG, "%"PRIu64": %p", msg[0], msg); + } + + TSLOGXL(TSINFO, + "%s: Added %"PRIu64" entries", + iq_file, + id + ); + + // Read all of the elements and verify that they have no overlap + id = -1; + while (1) + { + size_t len; + const uint64_t * const msg = iqueue_data(iq, ++id, &len); + if (id == msg_count) { + TS_TEST_ASSERT(msg == 0); + break; + } + + TS_TEST_ASSERT(msg); + + TS_TEST_EQUALS(len, msg_len); + + const uint64_t msg_offset = (uintptr_t) msg; + TS_TEST_ASSERT((msg_offset & allocator.align_mask) == 0); + + const uint64_t msg_id = be64toh(msg[0]); + 
TS_TEST_EQUALS(msg_id, id); + + const uint8_t * const buf = (const uint8_t*) msg; + for (size_t i = 8 ; i < msg_len ; i++) { + TS_TEST_ASSERT(buf[i] == (id & 0xFF)); + } + } + + TSLOGXL(TSINFO, "%s: all messages checked out ok", iq_file); + + iqueue_close(iq); + unlink(iq_file); + + return true; +} + +TS_ADD_TEST(nonrefilling_allocator) +{ + char *dir = ts_test_resource_get_dir("nonrefilling_allocator"); + char *iq_file = NULL; + if (asprintf(&iq_file, "%s/nonrefilling-allocator-test.iqx", dir) == -1) + TSABORTX("failed to asprintf"); + + const size_t min_alloc_len = 1; + const size_t max_alloc_len = 64; + const unsigned min_message_count = 0; + const unsigned max_message_count = 8; + + iqueue_t * const iq = iqueue_create(iq_file, 0, NULL, 0); + TS_TEST_ASSERT(iq); + + iqueue_id_t id = -1; + for (size_t alloc_len = min_alloc_len; alloc_len < max_alloc_len; alloc_len++) + { + for (unsigned message_count = min_message_count; message_count < max_message_count; message_count++) + { + const size_t bulk_len = alloc_len * message_count; + iqueue_allocator_t allocator; + TS_TEST_ASSERT(iqueue_allocator_init(iq, &allocator, bulk_len, 0) == 0); + + for (unsigned i = 0; i < message_count; i++) + { + iqueue_msg_t iqmsg; + void * const msg = iqueue_allocate(&allocator, alloc_len, &iqmsg); + TS_TEST_ASSERT(msg); + + memset(msg, ++id & 0xFF, alloc_len); + + iqueue_id_t msg_id; + TS_TEST_ASSERT(iqueue_update(iq, iqmsg, &msg_id) == 0); + + TS_TEST_EQUALS(msg_id, id); + + TSLOGXL(TSDEBUG, "%"PRIu64": %p", msg_id, msg); + } + + iqueue_msg_t iqmsg; + TS_TEST_ASSERT(iqueue_allocate(&allocator, 1, &iqmsg) == NULL); + } + } + + // Read all of the elements and verify that they have no overlap + id = -1; + for (size_t msg_len = min_alloc_len; msg_len < max_alloc_len; msg_len++) + { + for (unsigned message_count = min_message_count; message_count < max_message_count; message_count++) + { + for (unsigned i = 0; i < message_count; i++) + { + size_t len; + const void * const msg = 
iqueue_data(iq, ++id, &len); + TS_TEST_ASSERT(msg); + + TS_TEST_EQUALS(len, msg_len); + + const uint8_t * const buf = (const uint8_t*) msg; + for (size_t byte = 0 ; byte < msg_len ; byte++) { + TS_TEST_EQUALS(buf[byte], (id & 0xFF)); + } + } + } + } + + TSLOGXL(TSINFO, "%s: all messages checked out ok", iq_file); + + iqueue_close(iq); + unlink(iq_file); + + return true; +} diff --git a/test/iqueue_reopen_ctest.c b/test/iqueue_reopen_ctest.c new file mode 100644 index 0000000..8b73ac2 --- /dev/null +++ b/test/iqueue_reopen_ctest.c @@ -0,0 +1,114 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/** \file + * Test that the sealing and re-opening functions work + */ +#include "twosigma.h" + +#include "ctest.h" +#include "ctest_resource.h" + +#include +#include +#include +#include +#include "tslog.h" +#include "iqueue.h" + +TS_ADD_TEST(test) +{ + tslevel = TSDIAG; + + char *dir = ts_test_resource_get_dir("iqueue_reopen"); + char *iq_file = NULL; + if (asprintf(&iq_file, "%s/reopen.iqx", dir) == -1) + TSABORTX("failed to asprintf"); + + iqueue_t * const iq = iqueue_create(iq_file, 0, NULL, 0); + TS_TEST_ASSERT(iq); + + const size_t msg_len = 16; + uint64_t id; + + { + iqueue_msg_t iqmsg; + uint64_t * const msg = iqueue_allocate_raw(iq, msg_len, &iqmsg); + TS_TEST_ASSERT(msg); + + msg[0] = getpid(); + + TS_TEST_ASSERT(iqueue_update(iq, iqmsg, &id) == 0); + TS_TEST_EQUALS(id, 0); + + TSLOGXL(TSINFO, "%s: posted %"PRIu64" at %p", iq_file, id, msg); + } + + const uint64_t creation = iqueue_creation(iq); + TS_TEST_ASSERT(iqueue_reopen(iq) == 0); + TS_TEST_EQUALS(iqueue_creation(iq), creation); + + { + size_t len; + const uint64_t * const msg = iqueue_data(iq, id, &len); + TS_TEST_ASSERT(msg); + + TS_TEST_EQUALS(len, msg_len); + + TS_TEST_EQUALS(msg[0], (uint64_t) getpid()); + + TSLOGXL(TSINFO, "%s: read %"PRIu64" at %p", iq_file, id, msg); + } + + TSLOGXL(TSINFO, "%s: reopen worked", iq_file); + + const iqueue_id_t seal_id = iqueue_entries(iq); + TS_TEST_ASSERT(iqueue_archive(iq, seal_id) == 0); + + int rc = iqueue_reopen(iq); + TS_TEST_EQUALS(rc, -1); + TS_TEST_EQUALS(errno, ENOENT); + + // Success (in that we failed to reopen) + iqueue_t * const new_iq = iqueue_create(iq_file, 0, NULL, 0); + TS_TEST_ASSERT(new_iq); + + rc = iqueue_reopen(iq); + TS_TEST_EQUALS(rc, 0); + + TS_TEST_ASSERT(iqueue_creation(iq) != creation); + TS_TEST_EQUALS(iqueue_creation(iq), iqueue_creation(new_iq)); + + TS_TEST_EQUALS(iqueue_entries(iq), 0); + + const size_t namelen = strlen(iq_file) + 32; + char * archive_file = calloc(1, namelen); + TS_TEST_ASSERT(archive_file); + + 
snprintf(archive_file, namelen, + "%s.%"PRIu64, + iq_file, + creation + ); + + iqueue_close(iq); + iqueue_close(new_iq); + unlink(iq_file); + unlink(archive_file); + + TSLOGXL(TSINFO, "Test passed"); + + return true; +} diff --git a/test/iqueue_symlink_ctest.c b/test/iqueue_symlink_ctest.c new file mode 100644 index 0000000..1b05308 --- /dev/null +++ b/test/iqueue_symlink_ctest.c @@ -0,0 +1,87 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Test that symlink resolution for iqueue create works. + */ +#include "twosigma.h" +#include "tslog.h" + +#include + +#include "ctest.h" +#include "ctest_resource.h" + +#include "iqueue.h" + +TS_ADD_TEST(test) +{ + tslevel = TSDIAG; + char *dir = ts_test_resource_get_dir("iqueue_symlink"); + + char *symlink0 = NULL; + if (asprintf(&symlink0, "%s/symlink0", dir) == -1) + TSABORTX("failed to asprintf"); + + char *symlink1 = NULL; + if (asprintf(&symlink1, "%s/symlink1", dir) == -1) + TSABORTX("failed to asprintf"); + + char *dest_iqx = NULL; + if (asprintf(&dest_iqx, "%s/dest.iqx", dir) == -1) + TSABORTX("failed to asprintf"); + + // Case 1: symlink1 -> symlink2 -> dest_iqx. + // This should create a new iqueue in dest_iqx. 
+ if (symlink(symlink1, symlink0) == -1) + TSABORT("failed to symlink"); + + if (symlink(dest_iqx, symlink1) == -1) + TSABORT("failed to symlink"); + + iqueue_t * const iq1 = iqueue_create(symlink0, 0, NULL, 0); + TS_TEST_ASSERT(iq1); + TS_TEST_ASSERT(access(dest_iqx, F_OK) == 0); + + iqueue_append(iq1, "test", 5); // Add an entry for the check later. + iqueue_close(iq1); + + // Case 1: symlink1 -> symlink2 -> dest_iqx. + // This should reopen the iqueue created in the previous step. + iqueue_t * const iq2 = iqueue_create(symlink0, 0, NULL, 0); + TS_TEST_ASSERT(iq2); + TS_TEST_ASSERT(iqueue_entries(iq2) == 1); + iqueue_close(iq2); + + unlink(dest_iqx); + unlink(symlink1); + unlink(symlink0); + + // Case 3: symlink1 -> symlink2 -> symlink1. + // This should detect the loop and fail. + if (symlink(symlink1, symlink0) == -1) + TSABORT("failed to symlink"); + + if (symlink(symlink0, symlink1) == -1) + TSABORT("failed to symlink"); + + iqueue_t * const iq3 = iqueue_create(symlink0, 0, NULL, 0); + TS_TEST_ASSERT(!iq3); + unlink(symlink1); + unlink(symlink0); + + TSLOGX(TSINFO, "Test passed"); + return true; +} diff --git a/test/iqueue_try_update_ctest.c b/test/iqueue_try_update_ctest.c new file mode 100644 index 0000000..175de5a --- /dev/null +++ b/test/iqueue_try_update_ctest.c @@ -0,0 +1,104 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/** \file + * Verify that iqueue_try_update() returns the correct values. + */ +#include "twosigma.h" + +#include "tslog.h" +#include "ctest.h" +#include "ctest_resource.h" + +#include +#include +#include +#include "iqueue.h" +#include "tsclock.h" + + +TS_ADD_TEST(test) +{ + char *dir = ts_test_resource_get_dir("iqueue_try_update"); + char *filename = NULL; + if (asprintf(&filename, "%s/test.iqx", dir) == -1) + TSABORTX("failed to asprintf"); + + iqueue_t * const iq = iqueue_create(filename, 0, NULL, 0); + TS_TEST_ASSERT(iq); + + shash_t * const table = iqueue_writer_table(iq, 0, 1); + TS_TEST_ASSERT(table); + + shash_entry_t * const writer = shash_insert(table, 9999, 1234); + TS_TEST_ASSERT(writer); + + iqueue_writer_update(table, writer, 1233); + iqueue_writer_update(table, writer, 1234); + iqueue_writer_update(table, writer, 1235); + iqueue_writer_update(table, writer, 0); + iqueue_writer_update(table, writer, 1); + iqueue_writer_update(table, writer, 2); + + TS_TEST_EQUALS(iqueue_entries(iq), 0); + + iqueue_append(iq, "zero", 5); + iqueue_append(iq, "one", 4); + iqueue_append(iq, "two", 4); + iqueue_append(iq, "three", 6); + + iqueue_msg_t iqmsg; + char * const msg = iqueue_allocate_raw(iq, 5, &iqmsg); + TS_TEST_ASSERT(msg); + + msg[0] = 'f'; + msg[1] = 'o'; + msg[2] = 'u'; + msg[3] = 'r'; + msg[4] = '\0'; + + + // Try to append at index 4, which should be ok + TS_TEST_EQUALS(iqueue_try_update(iq, 4, iqmsg), 0); + + // Try to append at index 4 again, which should fail + TS_TEST_EQUALS(iqueue_try_update(iq, 4, iqmsg), IQUEUE_STATUS_HAS_DATA); + + // Try to append at index 6, which would leave a hole + TS_TEST_EQUALS(iqueue_try_update(iq, 6, iqmsg), IQUEUE_STATUS_INDEX_INVALID); + + // Seal the iqueue at an invalid index + TS_TEST_EQUALS(iqueue_try_seal(iq, 6), IQUEUE_STATUS_INDEX_INVALID); + + // Seal it too early + TS_TEST_EQUALS(iqueue_try_seal(iq, 4), IQUEUE_STATUS_HAS_DATA); + + // Seal it just right + TS_TEST_EQUALS(iqueue_try_seal(iq, 5), 
IQUEUE_STATUS_OK); + + // Verify that iqueue_try_update fails now + TS_TEST_EQUALS(iqueue_try_update(iq, 4, iqmsg), IQUEUE_STATUS_HAS_DATA); + + TS_TEST_EQUALS(iqueue_try_update(iq, 5, iqmsg), IQUEUE_STATUS_SEALED); + + // Try to append at index 7, which would leave a hole + TS_TEST_EQUALS(iqueue_try_update(iq, 7, iqmsg), IQUEUE_STATUS_INDEX_INVALID); + + // Try to append at index 6, which would be ok, except that the + // iqueue is sealed + TS_TEST_EQUALS(iqueue_try_update(iq, 6, iqmsg), IQUEUE_STATUS_INDEX_INVALID); + + return true; +} diff --git a/test/iqueue_writer_ctest.c b/test/iqueue_writer_ctest.c new file mode 100644 index 0000000..9d633dd --- /dev/null +++ b/test/iqueue_writer_ctest.c @@ -0,0 +1,63 @@ +/* + * Copyright 2021 Two Sigma Open Source, LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** \file + * Verify that the iqueue writer and heartbeat functions work. 
+ */
+#include "twosigma.h"
+
+#include "tslog.h"
+#include "ctest.h"
+#include "ctest_resource.h"
+
+#include  // NOTE(review): the header names of these four bare #include lines appear to have been lost in extraction -- restore from upstream
+#include
+#include
+#include
+#include "iqueue.h"
+#include "tsclock.h"
+
+TS_ADD_TEST(test)
+{
+    char *dir = ts_test_resource_get_dir("iqueue_writer");
+    char *filename = NULL;
+    if (asprintf(&filename, "%s/writer.iqx", dir) == -1)
+        TSABORTX("failed to asprintf");
+
+    iqueue_t * const iq = iqueue_create(filename, 0, NULL, 0);
+    TS_TEST_ASSERT(iq);
+
+    shash_t * const table = iqueue_writer_table(iq, 0, 1);
+    TS_TEST_ASSERT(table);
+
+    shash_entry_t * const writer = shash_insert(table, 9999, 1234); // key 9999, initial value 1234
+    TS_TEST_ASSERT(writer);
+
+    TS_TEST_ASSERT(iqueue_writer_update(table, writer, 1233) == 0); // below the stored 1234: expected to return 0
+    TS_TEST_ASSERT(iqueue_writer_update(table, writer, 1234) == 0); // equal to the stored 1234: expected to return 0
+    TS_TEST_ASSERT(iqueue_writer_update(table, writer, 1235) == 1); // this and the remaining updates are expected to return 1
+    TS_TEST_ASSERT(iqueue_writer_update(table, writer, (uint64_t) -1) == 1);
+    TS_TEST_ASSERT(iqueue_writer_update(table, writer, 1) == 1);
+    TS_TEST_ASSERT(iqueue_writer_update(table, writer, 2) == 1);
+
+    shash_entry_t * const writer2 = shash_insert_or_get(table, 9999, 1237); // same key as the insert above
+    TS_TEST_ASSERT(writer2);
+    TS_TEST_EQUALS(writer, writer2); // the already-existing entry must be handed back
+
+    TS_TEST_EQUALS(writer2->value, 2); // value from the last update, not the 1237 passed to insert_or_get
+
+    unlink(filename); // result is not checked
+    return true;
+}
diff --git a/test/nop-ssh b/test/nop-ssh
new file mode 100755
index 0000000..512c2fe
--- /dev/null
+++ b/test/nop-ssh
@@ -0,0 +1,18 @@
+#!/bin/sh
+# Copyright 2021 Two Sigma Open Source, LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# nop ssh command
+# Ignore the host argument and invoke the remainder of the command, preserving each argument as-is
+shift
+exec "$@"
diff --git a/test/shash/shash_ctest.c b/test/shash/shash_ctest.c
new file mode 100644
index 0000000..4aed3dc
--- /dev/null
+++ b/test/shash/shash_ctest.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright 2021 Two Sigma Open Source, LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "twosigma.h"
+
+#include "tslog.h"
+#include "ctest.h"
+
+#include "shash.h"
+#include "tsclock.h"
+#include  // NOTE(review): the header names of these two bare #include lines appear to have been lost in extraction -- restore from upstream
+#include
+
+static volatile int go = 0; // start flag: set to 1 by test_threaded() and never reset in this file
+static const uint64_t max_iters = 1 << 24; // increments performed by each thread
+static const size_t buf_size = 4096; // size of the buffer handed to shash_create()
+/** Thread body: insert a timestamp key, wait for go, then increment the entry with key 1234 max_iters times. */
+static void *
+thread_test(
+    void * const sh_ptr
+)
+{
+    shash_t * const sh = sh_ptr;
+    uint64_t now = tsclock_getnanos(0);
+    if (shash_insert(sh, now, now) == NULL)
+        TSABORTX("unable to insert key");
+
+    while (!go)
+        ; // busy-wait until test_threaded() has created every thread
+
+    shash_entry_t * e = shash_insert(sh, 1234, 0); // all threads race to create the counter entry
+    if (e)
+    {
+        TSLOGXL(TSINFO, "I won the entry");
+    } else {
+        e = shash_get(sh, 1234); // somebody else created it; look it up instead
+        if (!e)
+            TSABORTX("no entry?");
+    }
+
+    uint64_t tries = 0;
+    const uint64_t start_time = tsclock_getnanos(0);
+
+    for (unsigned i = 0 ; i < max_iters ; i++)
+    {
+        uint64_t val;
+        do {
+            val = e->value;
+            tries++;
+        } while (shash_update(sh, e, val, val+1) == 0); // retry until the update from val to val+1 is accepted
+    }
+
+    const uint64_t end_time = tsclock_getnanos(0);
+    const uint64_t delta_time = end_time - start_time;
+
+    TSLOGXL(TSINFO, "%"PRIu64" tries, %.2f avg tries/iter, %"PRIu64" ns/try",
+        tries,
+        tries / (double) max_iters,
+        delta_time / tries
+    );
+
+    return NULL;
+}
+
+/** Run max_threads copies of thread_test() on buf, via one shared shash_t or one per thread, then check the total. */
+static void
+test_threaded(
+    void * const buf,
+    const int shared,
+    const unsigned max_threads
+)
+{
+    pthread_t threads[max_threads];
+    shash_t * hashes[max_threads];
+    if (shared)
+        hashes[0] = shash_create(buf, buf_size, 0);
+
+    for (unsigned i = 0 ; i < max_threads; i++)
+    {
+        shash_t * sh;
+        if (!shared)
+            sh = hashes[i] = shash_create(buf, buf_size, 0);
+        else
+            sh = hashes[0];
+
+        if (pthread_create(&threads[i], NULL, thread_test, sh) != 0) // returns a positive error number on failure, never a negative value
+            TSABORT("thread create");
+    }
+
+    go = 1;
+    for (unsigned i = 0 ; i < max_threads; i++)
+        pthread_join(threads[i], NULL);
+
+    shash_entry_t * const e = shash_get(hashes[0], 1234);
+    if (!e)
+        TSABORTX("no entry?");
+    const uint64_t value = e->value;
+    if (value != max_threads * max_iters) // every thread must have contributed exactly max_iters increments
+        TSABORTX("value %"PRIu64" != expected %"PRIu64,
+            value,
+            max_threads * max_iters
+        );
+
+    if (!shash_update(hashes[0], e, value, 0)) // put the counter back to 0; presumably so the next run on buf starts clean
+        TSABORTX("value changed during check?");
+
+    TSLOGXL(TSINFO, "%s threaded test passed", shared ? "shared" : "private");
+}
+
+/** Single-threaded checks of shash_get(), shash_insert() and shash_update() using keys 9 and 10. */
+static void
+test_simple(
+    void * const buf
+)
+{
+    shash_t * const sh = shash_create(buf, buf_size, 0);
+    if (!sh)
+        TSABORTX("shash_create");
+
+    if (shash_get(sh, 9) != NULL)
+        TSABORTX("get should have failed");
+    shash_entry_t * e1 = shash_insert(sh, 9, 2345);
+    if (!e1)
+        TSABORTX("insert should have passed");
+    if (e1->key != 9)
+        TSABORTX("bad key: %"PRIu64, e1->key);
+    if (e1->value != 2345)
+        TSABORTX("bad value: %"PRIu64, e1->value);
+    if (shash_insert(sh, 9, 23498) != NULL) // duplicate key
+        TSABORTX("insert should have failed");
+    if (shash_update(sh, e1, 2344, 2346)) // old value does not match the stored 2345
+        TSABORTX("update should have failed");
+    if (!shash_update(sh, e1, 2345, 2346))
+        TSABORTX("update should have passed");
+
+    if (shash_get(sh, 10) != NULL)
+        TSABORTX("get should have failed");
+    shash_entry_t * e2 = shash_insert(sh, 10, 9999);
+    if (!e2)
+        TSABORTX("insert should have passed");
+    if (e2->key != 10)
+        TSABORTX("bad key: %"PRIu64, e2->key);
+    if (e2->value != 9999)
+        TSABORTX("bad value: %"PRIu64, e2->value);
+    if (!shash_update(sh, e2, 9999, 3))
+        TSABORTX("update should have passed");
+    if (shash_update(sh, e2, 9999, 3)) // the stored value is now 3, so 9999 no longer matches
+        TSABORTX("update should have failed");
+
+    if (shash_get(sh, 9) != e1)
+        TSABORTX("get did not return same entry");
+    if (shash_get(sh, 10) != e2)
+        TSABORTX("get did not return same entry");
+
+    shash_destroy(sh);
+}
+
+/** Test entry point: the simple test first, then the threaded test in several configurations on the same buffer. */
+TS_ADD_TEST(test)
+{
+    void * const buf = calloc(1, buf_size);
+
+    test_simple(buf);
+    test_threaded(buf, 0, 8); // private local hashes
+    test_threaded(buf, 1, 8); // shared local hashes
+
+    for (unsigned max_threads = 1 ; max_threads < 8 ; max_threads++)
+        test_threaded(buf, 0, max_threads);
+
+    TSLOGXL(TSINFO, "Ok");
+    return true;
+}
diff --git a/test/unlink_ctest.c b/test/unlink_ctest.c
new file mode 100644
index 0000000..2cef2d8
--- /dev/null
+++ b/test/unlink_ctest.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2021 Two Sigma Open Source, LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * 
you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/** \file
+ * Check that fstat() and ftruncate() still work on an open descriptor after its file is unlinked.
+ */
+#include "twosigma.h"
+
+#include "tslog.h"
+#include "ctest.h"
+#include "ctest_resource.h"
+
+#include  // NOTE(review): the header names of these five bare #include lines appear to have been lost in extraction -- restore from upstream
+#include
+#include
+#include
+#include
+
+TS_ADD_TEST(test)
+{
+    char *dir = ts_test_resource_get_dir("unlink");
+    char *file = NULL;
+    if (asprintf(&file, "%s/test.dat", dir) == -1)
+        TSABORTX("failed to asprintf");
+
+    const int fd = open(file, O_RDWR | O_CREAT, 0666);
+    TS_TEST_ASSERT(fd >= 0);
+
+    struct stat st;
+
+    TS_TEST_ASSERT(fstat(fd, &st) == 0);
+
+    printf("%s: len=%zu\n", file, (size_t) st.st_size);
+    TS_TEST_EQUALS(st.st_size, 0); // the test expects to start from an empty file
+
+    TS_TEST_ASSERT(ftruncate(fd, 1 << 20) == 0); // grow to 1 MiB while the name still exists
+
+    TS_TEST_ASSERT(fstat(fd, &st) == 0);
+
+    printf("%s: len=%zu\n", file, (size_t) st.st_size);
+    TS_TEST_EQUALS(st.st_size, 1 << 20);
+
+    TS_TEST_ASSERT(unlink(file) == 0); // the rest of the test is meaningless unless the name is really gone
+
+    TS_TEST_ASSERT(fstat(fd, &st) == 0); // the descriptor must still be usable after the unlink
+
+    printf("%s: len=%zu\n", file, (size_t) st.st_size);
+    TS_TEST_EQUALS(st.st_size, 1 << 20); // size unchanged by the unlink
+
+    TS_TEST_ASSERT(ftruncate(fd, 2 << 20) == 0); // resizing to 2 MiB must still work on the unlinked file
+
+    TS_TEST_ASSERT(fstat(fd, &st) == 0);
+
+    printf("%s: len=%zu\n", file, (size_t) st.st_size);
+    TS_TEST_EQUALS(st.st_size, 2 << 20);
+
+    close(fd);
+    return true;
+}