From 6227bf14e7d6ed0a1720b39a88d4d5b3bd4d1f8a Mon Sep 17 00:00:00 2001 From: Spencer Baugh Date: Wed, 22 Sep 2021 21:15:05 -0400 Subject: [PATCH] old unbuildable version of iqueue Interal commit 1c3eb04 in ts_dma_transport This is added just to give some initial history, in case it's useful to understand how and why things have changed. This doesn't build on its own. --- include/iqsync.h | 173 ++++ include/iqueue.h | 656 +++++++++++++++ include/shash.h | 161 ++++ src/iqsync-main.c | 444 ++++++++++ src/iqsync.c | 1604 +++++++++++++++++++++++++++++++++++ src/iqueue-main.c | 726 ++++++++++++++++ src/iqueue.c | 2041 +++++++++++++++++++++++++++++++++++++++++++++ src/shash.c | 342 ++++++++ 8 files changed, 6147 insertions(+) create mode 100644 include/iqsync.h create mode 100644 include/iqueue.h create mode 100644 include/shash.h create mode 100644 src/iqsync-main.c create mode 100644 src/iqsync.c create mode 100644 src/iqueue-main.c create mode 100644 src/iqueue.c create mode 100644 src/shash.c diff --git a/include/iqsync.h b/include/iqsync.h new file mode 100644 index 0000000..bea5c71 --- /dev/null +++ b/include/iqsync.h @@ -0,0 +1,173 @@ +#ifndef _dma_transport_iqsync_h_ +#define _dma_transport_iqsync_h_ + +/** \file + * iqsync magic constants for both the ssh and udp versions. + */ +#include "twosigma.h" +#include +#include +#include +#include "iqueue.h" +#include "tslock.h" + + +#define IQSYNC_HANDSHAKE_MAGIC 0x495148414E440005 + +/** Send at the start of TCP connection or every few seconds by + * the multicast version. + */ +struct iqsync_handshake +{ + uint64_t magic; + uint64_t creation; + uint64_t entries; + uint64_t hdr_len; + uint8_t hdr[]; +} __attribute__((packed)); + + +/** Sent as a resend request to please replay from the + * starting index. The replay will start from the specified + * index in the remote iqueue and will be sequenced with the + * desired sequence number. + * + * This is never sent to the multicast group; unicast only. 
+ */ +#define IQSYNC_START_MAGIC 0x4951535452540003 + +struct iqsync_start +{ + uint64_t magic; + uint64_t start_seq; + uint64_t start_index; + uint64_t flags; +} __attribute__((packed)); + + +/** Sent at the front of each packet, either unicast or multicast */ +#define IQSYNC_DATA_MAGIC 0x4951444154410004 + +struct iqsync_data +{ + uint64_t magic; + uint64_t src; // sending source + uint64_t iq_index; // sending index + uint64_t orig_src; // original source (for cycle detection) + uint64_t orig_index; // original source + uint32_t len; + uint8_t data[]; +} __attribute__((packed)); + + +/** Update of heartbeats that have changed since the last heartbeat */ +#define IQSYNC_HEARTBEAT_MAGIC 0x4951444154410005 +struct iqsync_heartbeat +{ + uint64_t magic_be64; + uint64_t count_be64; + shash_entry_t writers[]; +} __attribute__((packed)); + + +/** Shadow the actual values for an iqueue into a temporary for + * safe keeping. + */ +typedef struct +{ + const char * name; + uint64_t index; + uint64_t creation; + uint64_t entries; + uint64_t hdr_len; + uint64_t count; // msgs received + uint64_t len; // bytes received + void * hdr; +} iqsync_shadow_t; + + +/** Book keeping and options for the iqsync process. + * This is not the easiest structure to use for outside processes; + * it will likely have some significant rework if any other + * applications want to use the iqsync algorithm. 
+ */ +typedef struct +{ + int read_fd; + int write_fd; + + int do_clone; + int do_tail; + int do_push; + int do_pull; + int do_hdr_validate; + int do_server; + int do_prefetch; + int do_syncbehind; + volatile int do_shutdown; + int usleep_time; + int verbose; + uint64_t report_interval; + uint64_t rate_limit; // in MB/s + uint64_t avg_msg_len; // in bytes + + iqueue_t * iq; + bool close_iq_on_shutdown; + + shash_t * sources; + shash_entry_t * scan_index; + + tslock_t * heartbeats_lock; + shash_t * heartbeats_hash; + shash_entry_t * heartbeats; + shash_entry_t * heartbeats_copy; + struct iqsync_heartbeat * heartbeat_msg; + unsigned heartbeats_max; + + uint64_t start_time; + uint64_t report_time; + uint64_t report_tx_count; + uint64_t report_rx_count; + uint64_t report_rx_len; + int warned_cycle; + + const char * local_cpu; + const char * remote_cpu; + pthread_t push_thread; + pthread_t pull_thread; + pthread_t stat_thread; + pthread_t prefetch_thread; + pthread_t syncbehind_thread; + + iqsync_shadow_t remote; + iqsync_shadow_t local; +} iqsync_t; + + +int +iqsync_start( + iqsync_t * iqsync +); + + +/** Wait for the push/pull threads to exit. + * \note Does not close the iqsync->iq iqueue. + */ +int +iqsync_wait( + iqsync_t * iqsync +); + + +void +iqsync_stats( + iqsync_t * iqsync +); + +const struct iqsync_data * +iqsync_data_msg( + iqueue_t * const iq, + const uint64_t offset +); + +#endif diff --git a/include/iqueue.h b/include/iqueue.h new file mode 100644 index 0000000..8e8515f --- /dev/null +++ b/include/iqueue.h @@ -0,0 +1,656 @@ +/* $TwoSigma: iqueue.h,v 1.21 2012/02/07 13:37:46 thudson Exp $ */ + +/* + * Copyright (c) 2010 Two Sigma Investments, LLC + * All Rights Reserved + * + * THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF + * Two Sigma Investments, LLC. + * + * The copyright notice above does not evidence any + * actual or intended publication of such source code. 
+ */ + +#ifndef _iqueue_h_ +#define _iqueue_h_ + +#include "twosigma.h" +#include +#include +#include +#include "bswap.h" +#include "shash.h" + +__BEGIN_DECLS + +/** \file + * Multi-writer, persistent binary log file. + * + * Goals: + * 1. Lockless, multi-writer access to a memory mapped file + * 2. Threads can die at any time without corrupting meta data + * 3. Readers can track messages in order without communication to writers + * 4. Data file has no overhead from system; can have user specified + * headers, delimiters, etc. + */ + +typedef struct _iqueue iqueue_t; +typedef uint64_t iqueue_id_t; + + +/** Index queue message pointers contain the length and the offset + * packed into a single 64-bit value. + * + * The maximum size of the messages is defined by 64 - IQUEUE_MSG_BITS, + * or 1 MB. + */ +typedef struct { + uint64_t v; +} iqueue_msg_t; + +#define IQUEUE_MSG_SEALED ((uint64_t)-1) +#define IQUEUE_MSG_BITS 44 +#define IQUEUE_MSG_MASK ((((uint64_t) 1) << IQUEUE_MSG_BITS) - 1) +#define IQUEUE_MSG_MAX ((((uint64_t) 1) << (64 - IQUEUE_MSG_BITS)) - 1) +#define IQUEUE_MSG_BAD_ID ((uint64_t) -1) + + +static inline iqueue_msg_t +iqueue_msg( + uint64_t offset, + uint64_t len +) { + return (iqueue_msg_t) { + (len << IQUEUE_MSG_BITS) | (offset & IQUEUE_MSG_MASK) + }; +} + +static inline uint64_t +iqueue_msg_offset( + iqueue_msg_t msg +) +{ + return (uintptr_t)(msg.v & IQUEUE_MSG_MASK); +} + +static inline uint64_t +iqueue_msg_len( + iqueue_msg_t msg +) +{ + return (uint64_t)(msg.v >> IQUEUE_MSG_BITS); +} + + +/** Open an iqueue_t object backed by a file. + * + * @param writable: if true the file will be opened for read/write + * and entries can be added with iqueue_allocate() / iqueue_update(). + * + * @return NULL if there are any errors. + */ +iqueue_t * +iqueue_open( + const char * index_file, + bool writable +); + + +/** Create or open an iqueue_t object backed by a file. 
+ * + * Atomically creates an iqueue file on disk, or returns the existing + * iqueue of the same name. iqueue_creation() can be called to determine + * if the iqueue that was created was new or an existing one. + * + * @param creation_time: Non-zero time in nanoseconds to record in the + * iqueue header. + */ +iqueue_t * +iqueue_create( + const char * index_file, + const uint64_t creation_time, + const void * user_hdr, + size_t user_hdr_len +); + + +/** Create a prefetch thread to pre-fault pages for writing. */ +int +iqueue_prefetch_thread( + iqueue_t * const iq, + pthread_t * const thread_out +); + +/** Create a syncbehind thread to write out and release old blocks. */ +int +iqueue_syncbehind_thread( + iqueue_t * const iq, + pthread_t * const thread_out +); + +/** Re-open an iqueue. + * Useful if the iqueue has been sealed, archived and a new one created. + * + * After a successful reopen, iqueue_creation() can be used to see if + * a new iqueue has been returned or if it is the same one. + * + * \return 0 on success, -1 on error and sets errno: + * ENOENT: File does not exist (not fatal, might be transitory due to archiving) + * + */ +int +iqueue_reopen( + iqueue_t * iq +); + + +/** Close an iqueue_t. + * + * \note It is necessary that all threads using the iqueue_t + * be stopped before closing! + */ +void +iqueue_close( + iqueue_t * iqueue +); + + +/** Archive an iqueue by optionally sealing and renaming + * + * @param seal_id only archive if no new items have been added. + * If seal_id == -1, the archive will not be sealed. + * + * @return 0 on success, -1 on error and sets errno: + */ +int +iqueue_archive( + iqueue_t * iq, + uint64_t seal_id +); + + + + +/** mlock all current mapped segments and any future ones. + * + * It is necessary to have sufficient mlock-able ulimit + * to lock enough data to be useful. 
+ * + * Run: + * ulimit -l unlimited + * + * Or configure the amount in /etc/security/limits: + * + * * - memlock 16777216 # 16 GB + */ +int +iqueue_mlock( + iqueue_t * iq +); + + +/** Return the number of entries in the iqueue. + * \note This is not guaranteed to be the last entry since other + * processes may have added new ones. However, it will make a best + * effort to ensure that there the pointer is actually at the end. + */ +uint64_t +iqueue_entries( + const iqueue_t * iqueue +); + + +/** Return the amount of space used in the data file */ +uint64_t +iqueue_data_len( + const iqueue_t * const iq +); + +/** Retrieve the user header from the iqueue. */ +void * +iqueue_header( + iqueue_t * iqueue, + size_t * hdr_len_out +); + + +/** Return the creation time of the iqueue */ +uint64_t +iqueue_creation( + const iqueue_t * iqueue +); + +/** Returns non-zero if iqueue has been sealed */ +int +iqueue_is_sealed( + iqueue_t * iqueue +); + + +/** Return the filename of the iqueue */ +const char * +iqueue_name( + const iqueue_t * iqueue +); + + +typedef enum { + IQUEUE_MADV_WILLNEED, + IQUEUE_MADV_DONTNEED, +} iqueue_madvise_t; + +/** Advise the kernel about regions of the iqueue. + * @param start and end define the indices that are of interest. + * + * \note Since the iqueue data segment might not + * be contiguous and in order, it is possible for the portion of + * the file defined by start to be earlier than the region + * defined by end. No madvise request will be made in that case. + */ +int +iqueue_madavise( + iqueue_t * iqueue, + iqueue_madvise_t advice, + iqueue_id_t start, + iqueue_id_t end +); + + +/** Retrieve the shared hash of writers. + * + * The iqueue maintains a list of entries of "writers" that can + * record monotonically increasing heartbeat values to indicate + * to other iqueue readers as to their status. The typical datum + * recorded is a timestamp, although it can be any 64-bit value + * as long as it is monotonically increasing. 
+ * + * The iqueue has four such tables: + * * 0 is reserved for user heartbeats and will be synchronized by + * iqsync. + * * 1 is reserved for iqsync's shared source table. + * * 2 and 3 are available for users of the local iqueue. + * + * @param create will cause the hash to be created if it does not already + * exist. This is done in an atomic, idempotent fashion. + */ +shash_t * +iqueue_writer_table( + iqueue_t * const iq, + unsigned table_id, + int create +); + + +/** Update the status of the writer. + * + * \param sh must have been returned from iqueue_writer_table(). + * \param writer must have been returned by shash_insert() or shash_get(). + * + * \param timestamp is any value, although only values strictly higher + * will be written to the field with the exception of when the previous + * value of the timestamp has been set to -1, which can + * be used to indicate that a writer has exited or failed. + * + * \return 0 on success, 1 if the current value exceeds the timestamp + * and -1 on error. + */ +int +iqueue_writer_update( + shash_t * const sh, + shash_entry_t * const writer, + uint64_t timestamp +); + + + + +#define IQUEUE_BLOCK_SHIFT 30 +#define IQUEUE_BLOCK_SIZE ((uint64_t) (1 << IQUEUE_BLOCK_SHIFT)) +#define IQUEUE_BLOCK_MASK (IQUEUE_BLOCK_SIZE - 1) + + +/** Returns the id of the first entry in the iqueue */ +iqueue_id_t +iqueue_begin( + const iqueue_t * iq +); + + +/** Returns an id immediately after the last entry in the iqueue */ +iqueue_id_t +iqueue_end( + const iqueue_t * iq +); + + +#define IQUEUE_STATUS_OK 0 +#define IQUEUE_STATUS_HAS_DATA 1 +#define IQUEUE_STATUS_INDEX_INVALID 2 +#define IQUEUE_STATUS_NO_DATA 3 +#define IQUEUE_STATUS_SEALED 4 +#define IQUEUE_STATUS_INVALID_ARGUMENT 5 +#define IQUEUE_STATUS_NO_SPACE 6 + + +/** Returns in IQUEUE_STATUS_* code for the slot's state + * + * IQUEUE_STATUS_HAS_DATA There is a message in this slot. + * IQUEUE_STATUS_NO_DATA There is no message in this slot. 
+ * IQUEUE_STATUS_INDEX_INVALID id is off the charts of a valid index. + * IQUEUE_STATUS_SEALED The iqueue has been sealed at this slot. + * IQUEUE_STATUS_INVALID_ARGUMENT Something is wrong. + */ +int +iqueue_status( + const iqueue_t * iq, + iqueue_id_t id +); + + +/** Waits up to timeout_ns nanoseconds (or indefinitely if -1) for a non- + * NO_DATA status for the given id and returns it. + */ +int +iqueue_status_wait( + iqueue_t * iq, + iqueue_id_t id, + int64_t timeout_ns +); + + + +/** Returns the offset of the message at id, or -1 on error. */ +uint64_t +iqueue_offset( + iqueue_t * iq, + iqueue_id_t id, + size_t * size_out +); + + +/** Return a pointer to the data at a given offset. + * If do_map is specified, the region will be brought into memory with mmap(). + */ +const void * +iqueue_get_data( + iqueue_t * iq, + uint64_t offset, + int do_map +); + + +/** Returns the data and size for the message at id, or NULL or error. + * + * The message is immutable once committed, so it is not permitted to + * modify the memory returned. + */ +static inline const void * +iqueue_data( + iqueue_t * const iq, + iqueue_id_t id, + size_t * const size_out +) +{ + uint64_t offset = iqueue_offset(iq, id, size_out); + if (unlikely(offset == (uint64_t) -1)) + return NULL; + + return iqueue_get_data(iq, offset, 1); +} + +typedef struct +{ + // Constants + iqueue_t * iq; + uint64_t bulk_len; + int auto_refill; + uint64_t align_mask; // mask for force alignment; 0 == none, 0x7 == 8 byte, 0x1f == 32 byte, etc + + // Updated every time a reallocation is done + uint8_t * base; + uint64_t base_offset; + uint64_t offset; +} iqueue_allocator_t; + + + +/** Allocate a region from the iqueue */ +void * +iqueue_allocate_raw( + iqueue_t * iq, + const size_t len, + iqueue_msg_t * msg_out +); + + +/** Get a bulk allocator for the iqueue. + * + * These allow fast thread-local allocation of messages without + * contending for the iqueue's data_tail pointer. 
+ * + * If auto_refill is set, the allocator will automatically refill with + * a fresh bulk_size amount of data. If the data segment needs to be + * extended, this will also perform the extension. + */ +int +iqueue_allocator_init( + iqueue_t * iq, + iqueue_allocator_t * allocator, + size_t bulk_size, + int auto_refill +); + + +/** Get a new chunk for the allocator. + * \note if auto_refill is set when the allocator is created, + * this does not need to be called. + * + * \return 0 on success, -1 on failure. + */ +int +iqueue_allocator_refill( + iqueue_allocator_t * allocator +); + + +/** Allocate from the chunk reserved to the bulk allocator. + * + * If there is insufficient space in this allocator and auto_refill + * is not true, then NULL will be returned. Otherwise a new chunk + * will be reserved. + * + * \note It is possible to allocate more than the IQUEUE_MSG_MAX + * size, but it will not be possible to commit the resulting message + * to the index. + */ +static inline void * +iqueue_allocate( + iqueue_allocator_t * const allocator, + const size_t len, + iqueue_msg_t * msg_out +) +{ + if (unlikely(len > allocator->bulk_len)) + return NULL; + + while (1) + { + uint64_t base = allocator->base_offset + allocator->offset; + uint64_t aligned = (base + allocator->align_mask) + & ~allocator->align_mask; + uint64_t offset = aligned + len - allocator->base_offset; + + if (likely(offset <= allocator->bulk_len)) + { + allocator->offset = offset; + *msg_out = iqueue_msg(aligned, len); + return allocator->base + aligned - allocator->base_offset; + } + + // It didn't fit; try to get more space + if (!allocator->auto_refill + || iqueue_allocator_refill(allocator) < 0) + return NULL; + } +} + + + +/** Resize an allocation to make it smaller. + * + * \return 0 on inability to resize (not an error), + * 1 on success, and -1 on error. 
+ */ +int +iqueue_realloc( + iqueue_allocator_t * allocator, + iqueue_msg_t * msg, + size_t new_len +); + + +/** Resize an oversized allocation. + * Some applications, like iqsync, might allocate more than the IQUEUE_MSG_MAX + * size messages and need a way to resize them that can not be represented by + * the iqueue_msg_t embedded offset. + * + * \return 0 on inability to resize (not an error), + * 1 on success, and -1 on error. + */ +int +iqueue_realloc_bulk( + iqueue_allocator_t * allocator, + iqueue_msg_t * msg, + size_t old_len, + size_t new_len +); + + + +/** Store a message at the end of the log, writing the index into + * a location with the same lockless guarantee as the index update. + * + * This means that the slot id in the index which will be written with + * pointer to this message will be written to id_out, stored before + * the message is committed to the index, allowing lockless consistency + * if id_out points to inside of the message itself. + * + * \return 0 on success, or an error code on failure: + * + * IQUEUE_STATUS_INDEX_INVALID == there is no space left in the index + * IQUEUE_STATUS_SEALED == the iqueue has been sealed against writing + * IQUEUE_STATUS_INVALID_ARGUMENT == bad offset or length in the iqmsg. 
+ */ +int +iqueue_update( + iqueue_t * iq, + iqueue_msg_t msg, + iqueue_id_t * id_out +); + + +/** Same as iqueue_update, except writes to id_out in big-endian */ +int +iqueue_update_be( + iqueue_t * iq, + iqueue_msg_t msg, + iqueue_id_t * id_be_out +); + + +/** Convenience function to commit a message to the iqueue */ +static inline int +iqueue_append( + iqueue_t * const iq, + const void * const buf, + size_t len +) +{ + if (len >= IQUEUE_MSG_MAX) + return IQUEUE_STATUS_INVALID_ARGUMENT; + + iqueue_msg_t iqmsg; + void * const msg = iqueue_allocate_raw(iq, len, &iqmsg); + if (!msg) + return IQUEUE_STATUS_NO_SPACE; + + memcpy(msg, buf, len); + int rc = iqueue_update(iq, iqmsg, NULL); + return rc; +} + + + +/** Attempt to store a message in the log at the desired slot. + * entry should point to the buffer returned from iqueue_allocate() + * once all of the log data has been copied into it. + * + * \return same as iqueue_update, with the additional status: + * + * IQUEUE_STATUS_HAS_DATA == A message is present in this slot. + */ +int +iqueue_try_update( + iqueue_t * iq, + iqueue_id_t slot, + iqueue_msg_t msg +); + + +/** Seals the iqueue, blocking any further write attempts. + * + * Once an iqueue has been sealed no further writes will be permitted. + * iqueue_update() and iqueue_get_status() will return IQUEUE_STATUS_SEALED + * and iqueue_data() of an index past or equal to the index_tail will + * return an error. + * + * \return same as iqueue_update + */ +int +iqueue_seal( + iqueue_t * iq +); + + +/** Attempts to seal the iqueue at the given index, blocking any further + * write attempts. + * + * This is useful if the application wants to be sure that it has processed + * every message in an iqueue before sealing it, without racing with + * a writer. + * + * \return same as iqueue_try_update + */ +int +iqueue_try_seal( + iqueue_t * iq, + iqueue_id_t id +); + + +/** Prefetch some data. 
+ * + * Typically this is invoked by the prefetch thread, but can be + * done by any user as well. + * + * \note Will grow the file if necessary and perform additional + * mappings to make the new regions resident. + */ +int +iqueue_prefetch( + iqueue_t * iq, + uint64_t offset, + uint64_t extent +); + + +/** Report some debugging information about an iqueue entry */ +void +iqueue_debug( + iqueue_t * iq, + uint64_t id +); + +__END_DECLS + +#endif diff --git a/include/shash.h b/include/shash.h new file mode 100644 index 0000000..6fd598a --- /dev/null +++ b/include/shash.h @@ -0,0 +1,161 @@ +/** \file + * Shared simple hash table. + * + * This lockless hash table is designed for use embedded in other + * dma/transport files, such as iqueue headers for use of iqsync. + * + * The underlying hash lookup algorithm is not efficient since it + * requires linear search, but the keys will never move once they + * are added to the hash so it is possible to have a local hash that + * maps pointers to the keys. + * + * The hash entries must be 16-byte aligned for the cmpxchgb16() to be + * able to safely write to them. + * + * There is no init function since the only requirement is that + * the hash entries must be zeroed when the space is made visible to + * other threads or processes. + * + * \note key == 0 is not allowed. + */ +#ifndef _ts_dma_transport_shash_ +#define _ts_dma_transport_shash_ + +#include + + +typedef struct { + const uint64_t key; + const volatile uint64_t value; +} shash_entry_t; + + +typedef struct _shash_t shash_t; + +__BEGIN_DECLS + + +/** Create a local representation of the shash. + * + * This does not create the underlying hash; shash_ptr must + * have already been allocated and may already be in use + * in a shared data structure. + * + * This stores a cache of the keys to speed up repeated calls to + * shash_lookup(). + * + * If read_only is set, shash_insert() and shash_update() will fail. 
+ * If read_only is not set, the const-ness will be cast away from the + * buffer. + * + * Keys may not be deleted from a shash. Once inserted, they will always + * be present. + */ +shash_t * +shash_create( + const void * shash_ptr, + size_t shash_len, + int read_only +); + + +/** Create a thread-local copy of a shash_t. + * + * Since the get operations require a lock to protect the shadow + * hash, it can be expensive to serialize on the gets. Create + * a thread-local version that does not share its lock with the + * original shash. + */ +shash_t * +shash_copy( + shash_t * sh +); + + +/** Free local resources. + * + * This does not modify the underlying hash. + */ +void +shash_destroy( + shash_t * shash +); + + +/** Retrieve a shash entry from the hash. + * + * This has O(n) time if the key has not been seen before, + * unlike a normal hash, but repeated look ups will be O(1). + * + * If key does not exist in the hash, get will return NULL. + */ +shash_entry_t * +shash_get( + shash_t * sh, + uint64_t key +); + + +/** Insert a new key/value pair into the shared hash. + * + * This has O(n) time potentially since it will walk the hash + * to examine any new items. + * + * If the key already exists in the shared hash or if there is no + * space left in the shared hash, NULL will be returned. + * + * \note key == 0 is not allowed. + */ +shash_entry_t * +shash_insert( + shash_t * sh, + uint64_t key, + uint64_t value +); + + +/** Insert a new key/value pair or return the existing one. */ +static inline shash_entry_t * +shash_insert_or_get( + shash_t * sh, + uint64_t key, + uint64_t value +) +{ + shash_entry_t * entry = shash_insert(sh, key, value); + if (entry) + return entry; + return shash_get(sh, key); +} + + +/** Atomically set a new value if the old value has not changed. 
+ * + * The internal logic is: + * + * If entry->value != old_value then + * return 0; + * entry->value = new_value; + * return 1; + */ +int +shash_update( + shash_t * sh, + shash_entry_t * entry, + uint64_t old_value, + uint64_t new_value +); + + +/** Return the pointer to the array of entries. + */ +shash_entry_t * +shash_entries( + shash_t * const sh, + unsigned * const max_entries +); + + +__END_DECLS + +#endif diff --git a/src/iqsync-main.c b/src/iqsync-main.c new file mode 100644 index 0000000..4a5227a --- /dev/null +++ b/src/iqsync-main.c @@ -0,0 +1,444 @@ +/* $TwoSigma: iqsync-main.c,v 1.6 2012/02/02 20:53:59 thudson Exp $ */ + +/* + * Copyright (c) 2010 Two Sigma Investments, LLC + * All Rights Reserved + * + * THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF + * Two Sigma Investments, LLC. + * + * The copyright notice above does not evidence any + * actual or intended publication of such source code. + */ +#include "twosigma.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "tsutil.h" +#include "tsio.h" +#include "tsclock.h" +#include "tsnet.h" +#include "iqueue.h" +#include "iqsync.h" +#include "segfault.h" + +__RCSID("$TwoSigma: iqsync-main.c,v 1.6 2012/02/02 20:53:59 thudson Exp $"); + + + +/** \file + * Push or pull changes from an iqueue with another over stdin/stdout + * tunneled through ssh, or over a TCP socket. + */ + +static struct option long_options[] = { + { "help", no_argument, 0, '?' 
}, + { "iqueue", required_argument, 0, 'f' }, + { "tail", no_argument, 0, 't' }, + { "server", required_argument, 0, 'R' }, + { "sleep", required_argument, 0, 's' }, + { "rate-limit", required_argument, 0, 'M' }, + { "report-interval", required_argument, 0, 'r' }, + { "type", required_argument, 0, 'T' }, + { "push", no_argument, 0, 'p' }, + { "pull", no_argument, 0, 'P' }, + { "nop", no_argument, 0, 'Z' }, + { "validate", no_argument, 0, 'V' }, + { "verbose", no_argument, 0, 'v' }, + { "clone", no_argument, 0, 'C' }, + { "remote-cpu", required_argument, 0, 'K' }, + { "cpu", required_argument, 0, 'c' }, + { "prefetch", no_argument, 0, 'e' }, + { "syncbehind", no_argument, 0, 'b' }, + { 0, 0, 0, 0}, +}; + +static void +__attribute__((noreturn)) +usage( + FILE * stream, + const char * msg +) +{ + static const char usage_str[] = +"Options:\n" +" -h | -? | --help This help\n" +" -f | --iqueue /path/file Local iqueue (required)\n" +" -V | --validate Validate that the iqueue headers match\n" +" -M | --rate-limit N Read no more than N MB/s\n" +" -p | --push Push local changes\n" +" -P | --pull Pull remote changes\n" +" -s | --sleep N Sleep N useconds when there is no data\n" +" -r | --report-interval N Report TX/RX stats every N seconds\n" +" -t | --tail Keep tracking entries as they are added\n" +" -v | --verbose Report every TX/RX message\n" +" -T | --type {ssh|tcp} Transport type (default ssh)\n" +" -c | --cpu N Bind local push thread to CPU N\n" +" -K | --remote-cpu N Bind remote push thread to CPU N (only with ssh)\n" +" -e | --prefetch Create prefetch thread\n" +" -b | --syncbehind Create syncbehind thread\n" +"\n" +"Cloning options:\n" +" Cloning implies --pull and --validate. If the local sizes are not\n" +" specified the remote sizes will be used. 
Bi-directional cloning\n" +" is supported.\n" +"\n" +" -C | --clone Clone a remote iqueue\n" +"\n" +"\n" +"Push/pull:\n" +" The direction of the operation is from the view of the local process:\n" +"\n" +" 'iqsync --pull' or 'iqsync --clone' will open the remote queue read-only,\n" +" read entries from it and write them to the local queue.\n" +"\n" +" 'iqsync --push' will open the local queue read-only, read entries from\n" +" it and write to the remote queue. It will not create a set of remote\n" +" files; they must already exist or iqsync will exit with an error.\n" +"\n" +" 'iqsync --push --pull' will open both files read/write and merge\n" +" their entries. Keep in mind that the order of the entries that already\n" +" exist in each queue will never change, so the queues will not be exactly\n" +" identical, but all entries in the remote queue will appear in the same\n" +" order relative to each other.\n" +"\n" +"\n" +"SSH Usage:\n" +" iqsync [options...] [user@]host:/path/to/remote.iqx\n" +"\n" +" Environment variables in SSH mode:\n" +" IQSYNC_CMD The command to be run on the remote side\n" +" IQSYNC_RSH The local rsh/ssh command to be run\n" +"\n" +"\n" +"TCP Server Usage:\n" +" iqsync --type tcp --server [IPADDR:]PORT [options...]\n" +"\n" +"TCP Client Usage:\n" +" iqsync --type tcp [options...] IPADDR:PORT\n" +"\n" +"To invoke from netcat, create a shell script to invoke iqsync and\n" +"invoke it with \"--server unused --type ssh\". 
Netcat will wait for\n" +"a connection and then iqsync will read/write from stdin/stdout over\n" +"the socket.\n" +"\n" +; + + fprintf(stream, "%s%s", msg, usage_str); + exit(EXIT_FAILURE); +} + + +static int +iqsync_setup_ssh( + iqsync_t * const iqsync, + const char * remote_name +) +{ + // If we are in srever mode, everything is setup + if (iqsync->do_server) + return 0; + + // Get remote host:file from argv + iqsync->remote.name = remote_name; + if (!iqsync->remote.name) + TSABORTX("Remote iqueue must be specified"); + + char * remote_host = strdup(iqsync->remote.name); + char * remote_file = index(remote_host, ':'); + if (!remote_file) + TSABORTX("Unable to parse remote iqueue name '%s'", remote_host); + *(remote_file++) = '\0'; + + // Check for an env variable for the iqsync command + const char * remote_cmd = getenv("IQSYNC_CMD"); + if (!remote_cmd) + remote_cmd = "iqsync"; + const char * ssh_cmd = getenv("IQSYNC_RSH"); + if (!ssh_cmd) + ssh_cmd = "/usr/bin/ssh"; + + + // Determine my name to pass as the remote hostname + char my_name[1024]; + if (gethostname(my_name, sizeof(my_name)) < 0) + TSABORT("Unable to get my hostname"); + int name_len = strlen(my_name); + my_name[name_len++] = ':'; + strncpy(my_name + name_len, iqsync->local.name, sizeof(my_name) - name_len); + + char usleep_str[16]; + snprintf(usleep_str, sizeof(usleep_str), "%d", iqsync->usleep_time); + + char rate_limit_str[16]; + snprintf(rate_limit_str, sizeof(rate_limit_str), "%"PRIu64, iqsync->rate_limit); + + // Redirect stdin/stdout, but let it write to our stderr + int fds[3]; + pid_t child = tsio_open3( + fds, + TSIO_STDIN_MASK | TSIO_STDOUT_MASK, + ssh_cmd, + (const char *[]) { + ssh_cmd, + remote_host, + remote_cmd, + "--server", my_name, + "--type", "ssh", + "-f", + remote_file, + "--sleep", usleep_str, + "--rate-limit", rate_limit_str, + iqsync->verbose ? "--verbose" : "--nop", + iqsync->do_push ? "--pull" : "--nop", // note reversed sense + iqsync->do_pull ? 
"--push" : "--nop", // note reversed sense + iqsync->do_tail ? "--tail" : "--nop", + iqsync->remote_cpu ? "--cpu" : "--nop", + iqsync->remote_cpu ? iqsync->remote_cpu : "--nop", + 0 + } + ); + if (child < 0) + TSABORTX("Unable to fork %s", ssh_cmd); + + iqsync->read_fd = fds[1]; + iqsync->write_fd = fds[0]; + + return 0; +} + + +static int +iqsync_setup_tcp( + iqsync_t * const iqsync, + const char * const remote_name +) +{ + static char default_port[] = "20809"; + static char default_bind[] = "0.0.0.0"; + + if (!iqsync->do_server) + { + // client connects make a TCP socket and are done. + char * name = strdup(remote_name); + char * port = strchr(name, ':'); + if (port) + *port++ = '\0'; + else + port = default_port; + + TSLOGX(TSINFO, "%s: Connecting to %s:%s", + iqsync->local.name, + name, + port + ); + + int fd = tsnet_tcp_client_socket(name, port, 0); + if (fd < 0) + { + TSLOG(TSERROR, "Unable to connect to %s:%s", name, port); + return -1; + } + + iqsync->read_fd = fd; + iqsync->write_fd = fd; + iqsync->remote.name = name; + return 0; + } + + // Make sure the parameters are correct; can not do a clone into + // a non-existant iqueue + // only get write access if we are pulling i + const bool writable = iqsync->do_pull ? 
true : false; + iqsync->iq = iqueue_open(iqsync->local.name, writable); + if (!iqsync->iq) + { + TSLOGX(TSERROR, "%s: Unbale to open", iqsync->local.name); + return -1; + } + + // Use the remote server name for [[IP:]PORT] to bind to + char * server_name = default_bind; + char * port_name = default_port; + + char * orig_name = strdup(iqsync->remote.name); + if (iqsync->remote.name[0] != '\0') + { + char * colon = strchr(orig_name, ':'); + if (!colon) + port_name = orig_name; + else { + *port_name++ = '\0'; + server_name = orig_name; + } + } + + int server_fd = tsnet_tcp_server_socket(server_name, port_name, 0); + if (server_fd < 0) + TSABORT("tcp bind to %s:%s", server_name, port_name); + + TSLOGX(TSINFO, + "%s: Waiting for inbound connections on TCP port %s:%s", + iqsync->local.name, + server_name, + port_name + ); + + while (1) + { + struct sockaddr_in remote_addr; + socklen_t remote_len = sizeof(remote_addr); + int fd = accept(server_fd, &remote_addr, &remote_len); + if (fd < 0) + TSABORT("accept %s:%s", server_name, port_name); + + // Duplicate the iqsync_t for the new connection + iqsync_t * const new_iqsync = calloc(1, sizeof(*new_iqsync)); + if (!new_iqsync) + TSABORT("unable to allocate iqsync object"); + *new_iqsync = *iqsync; + + char client_name[256]; + snprintf(client_name, sizeof(client_name), "%s:%d", + inet_ntoa(remote_addr.sin_addr), + ntohs(remote_addr.sin_port) + ); + new_iqsync->remote.name = strdup(client_name); + + TSLOGX(TSINFO, "%s: Connected to %s", + new_iqsync->local.name, + new_iqsync->remote.name + ); + + new_iqsync->read_fd = fd; + new_iqsync->write_fd = fd; + + if (iqsync_start(new_iqsync) < 0) + return -1; + + // Detatch from the threads so that they will exit cleanly + // This will leak the new_iqsync object, but that is ok for now. 
+ pthread_detach(new_iqsync->push_thread); + pthread_detach(new_iqsync->pull_thread); + pthread_detach(new_iqsync->stat_thread); + } + + // Unreachable + free(orig_name); + close(server_fd); + exit(EXIT_SUCCESS); +} + + +int +main( + int argc, + char **argv +) +{ + //segfault_handler_install(); + iqsync_t * const iqsync = calloc(1, sizeof(*iqsync)); + + *iqsync = (iqsync_t) { + .report_interval = 600, + .usleep_time = 100, + .heartbeats_lock = tslock_alloc(), + .read_fd = STDIN_FILENO, + .write_fd = STDOUT_FILENO, + }; + + int option_index = 0; + const char * usleep_time_str = "0"; + const char * transport_type = "ssh"; + bool prefetch = false; + bool syncbehind = false; + + while (1) + { + int c = getopt_long( + argc, + argv, + "h?f:ts:pvVZR:Cr:T:m:c:K:eb", + long_options, + &option_index + ); + + if (c == -1) + break; + + switch (c) + { + case 0: break; + default: usage(stderr, ""); break; + case 'h': case '?': usage(stdout, ""); break; + + case 'Z': break; // nop + case 'C': + iqsync->do_clone = 1; + iqsync->do_hdr_validate = 1; + iqsync->do_pull = 1; + break; + case 'f': iqsync->local.name = optarg; break; + case 't': iqsync->do_tail = 1; break; + case 'c': iqsync->local_cpu = optarg; break; + case 'K': iqsync->remote_cpu = optarg; break; + case 'M': iqsync->rate_limit = strtoul(optarg, NULL, 0); break; + case 'T': transport_type = optarg; break; + case 'p': iqsync->do_push = 1; break; + case 'P': iqsync->do_pull = 1; break; + case 'V': iqsync->do_hdr_validate = 1; break; + case 'v': iqsync->verbose++; tslevel = TSDEBUG; break; + case 'r': iqsync->report_interval = strtoul(optarg, NULL, 0); break; + case 'R': + iqsync->do_server = 1; + iqsync->remote.name = optarg; + break; + case 's': + usleep_time_str = optarg; + iqsync->usleep_time = strtoul(optarg, 0, 0); + break; + case 'e': + prefetch = true; + break; + case 'b': + syncbehind = true; + break; + } + } + + if (!iqsync->local.name) + usage(stderr, "iqueue file must be specified!\n"); + + if (!iqsync->do_push 
&& !iqsync->do_pull) + usage(stderr, "At least one of --push / --pull must be specified!\n"); + + if (strcmp(transport_type, "ssh") == 0) + { + if (iqsync_setup_ssh(iqsync, argv[optind]) < 0) + return -1; + } else + if (strcmp(transport_type, "tcp") == 0) + { + if (iqsync_setup_tcp(iqsync, argv[optind]) < 0) + return -1; + } else + usage(stderr, "Unknown --type option!\n"); + + iqsync->do_prefetch = prefetch; + iqsync->do_syncbehind = syncbehind; + + // All configured. Start the threads + if (iqsync_start(iqsync) < 0) + return -1; + + if (iqsync_wait(iqsync) < 0) + return -1; + + return 0; +} diff --git a/src/iqsync.c b/src/iqsync.c new file mode 100644 index 0000000..d399b77 --- /dev/null +++ b/src/iqsync.c @@ -0,0 +1,1604 @@ +/* $TwoSigma: iqsync.c,v 1.24 2012/02/07 13:37:40 thudson Exp $ */ + +/* + * Copyright (c) 2010 Two Sigma Investments, LLC + * All Rights Reserved + * + * THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF + * Two Sigma Investments, LLC. + * + * The copyright notice above does not evidence any + * actual or intended publication of such source code. + */ +#include "twosigma.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bswap.h" +#include "tsutil.h" +#include "tslock.h" +#include "tsio.h" +#include "tsclock.h" +#include "tssched.h" +#include "iqueue.h" +#include "iqsync.h" +#include "segfault.h" + +__RCSID("$TwoSigma: iqsync.c,v 1.24 2012/02/07 13:37:40 thudson Exp $"); + +/** \file + * Core iqsync algorithm and threads. + * + * See iqsync-main.c for an example of how to create an iqsync_t object + * and start them. 
+ */ + +void +iqsync_stats( + iqsync_t * const iqsync +) +{ + const uint64_t now = tsclock_getnanos(0); + + const uint64_t rx_len = iqsync->remote.len; + const uint64_t rx_count = iqsync->remote.count; + const uint64_t tx_count = iqsync->local.count; + const uint64_t report_delta = now - iqsync->report_time; + + if (iqsync->do_pull) + TSLOGX(TSINFO, "%s: RX index %"PRId64": %"PRIu64" messages %.3f kpps, %.2f MB/s", + iqsync->remote.name, + iqsync->remote.index, + rx_count, + (rx_count - iqsync->report_rx_count) * 1.0e6 / report_delta, + (rx_len - iqsync->report_rx_len) * 1.0e3 / report_delta + ); + + if (iqsync->do_push) + TSLOGX(TSINFO, "%s: TX index %"PRId64": %"PRIu64" messages %.3f kpps, avg size %"PRIu64" bytes", + iqsync->remote.name, + iqsync->local.index, + tx_count, + (tx_count - iqsync->report_tx_count) * 1.0e6 / report_delta, + iqsync->avg_msg_len + ); + + iqsync->report_rx_count = rx_count; + iqsync->report_tx_count = tx_count; + iqsync->report_rx_len = rx_len; + iqsync->report_time = now; +} + + +static void * +iqsync_stat_thread( + void * const iqsync_ptr +) +{ + iqsync_t * const iqsync = iqsync_ptr; + if (iqsync->report_interval == 0) + return NULL; + + while (!iqsync->do_shutdown) + { + sleep(iqsync->report_interval); + iqsync_stats(iqsync); + } + + return NULL; +} + + + +/** Return a pointer to the containing iqsync_msg_t if the + * message in the iqueue is of type iqsync_msg. Returns NULL + * otherwise. + */ +const struct iqsync_data * +iqsync_data_msg( + iqueue_t * const iq, + const uint64_t offset +) +{ + const void * const data = iqueue_get_data(iq, offset, 1); + if (!data) + return NULL; + + const struct iqsync_data * const msg = container_of( + (void*)(uintptr_t) data, + const struct iqsync_data, + data + ); + + // Make sure that the offset does not cross the front of + // a block boundary, which would indicate that this is not + // a iqsync message, and also check the magic value + // to be sure that the msg header is intact. 
+ if ((offset & IQUEUE_BLOCK_MASK) < offsetof(struct iqsync_data, data) + || msg->magic != htobe64(IQSYNC_DATA_MAGIC)) + return NULL; + + // It appears to be a valid iqsync message. + return msg; +} + + +/** Setup the local source table, initializing the persistent source table + * if it does not yet exist. + */ +static int +iqsync_sources_setup( + iqsync_t * const iqsync +) +{ + iqueue_t * const iq = iqsync->iq; + shash_t * const sh = iqueue_writer_table(iq, 1, 1); + + if (!sh) + { + TSLOGX(TSERROR, "%s: Unable to create iqsync sources table", + iqueue_name(iq) + ); + return -1; + } + + iqsync->sources = shash_copy(sh); + iqsync->scan_index = shash_insert_or_get(iqsync->sources, -1, 0); + + TSLOGX(TSINFO, "%s: Sources table skipping to index %"PRIu64, + iqueue_name(iq), + iqsync->scan_index->value + ); + + return 0; +} + + +static int +iqsync_hash_update( + shash_t * const sh, + const uint64_t src_id, + const uint64_t src_index +) +{ + shash_entry_t * source = shash_get(sh, src_id); + + if (!source) + { + // Writer does not yet exist; create a new one. + // If this succeeds, the source id not yet exist and the newly + // created one will have the value that we provided. + source = shash_insert(sh, src_id, src_index); + + // If it did not succeed, then we raced with another thread to + // create this entry in the hash and must follow the update + // protocol by retrieving the existing one + if (!source) + source = shash_get(sh, src_id); + + // If there is still no source, the iqueue is corrupted + if (!source) + TSABORTX("corrupt iqueue? bad source behaviour"); + } + + return iqueue_writer_update(sh, source, src_index); +} + + +/** Retrieve a source from the shared hash, or insert it with + * a zero-value if it does not yet exist. 
+ */ +static uint64_t +iqsync_sources_get( + iqsync_t * const iqsync, + const uint64_t src_id +) +{ + if (src_id == 0) + return 0; + + shash_entry_t * const source = shash_insert_or_get( + iqsync->sources, + src_id, + 0 + ); + + return source->value; +} + + +/** Read all messages newer than the scan index shared variable + * and update the shared sources hash table through the end of the + * iqueue. + * + * \param src_index_out will be set to point to the latest src index + * for the src_id, if it is non-zero. + * + * \return The local index slot id into which the message should be stored. + */ +static uint64_t +iqsync_sources_scan_all( + iqsync_t * const iqsync, + const uint64_t src_id, + uint64_t * const src_index_out +) +{ + iqueue_t * const iq = iqsync->iq; + + // Read the current shared scan index. The shared hash table cache + // is guaranteed to be current at least up to this value. It might + // be newer if a writer has crashed between updating the hash + // but before it could update the scan index. In either event, + // this process must scan from that point to the end of the iqueue + // and bring the shared hash table up to date. + // + // It is very important that this value is read before the expected + // value for src_id. The race is if the hash[src_id] is read, and then + // some values from src_id are added, and another process moves scan_index + // to the end of the iqueue, then this reader will not see the messages + // that would have updated hash[src_id] and will write duplicate messages. + const uint64_t orig_scan_index = iqsync->scan_index->value; + + // Retrieve the current value of the cached src index for src id. + // This might be newer than the value of scan_index, but that is ok + // since the linear scan will pass by the message that would have + // advanced hash[src_id]. 
+ uint64_t src_index = iqsync_sources_get(iqsync, src_id); + + uint64_t scan_index = orig_scan_index; + + while (1) + { + size_t len; + const uint64_t offset = iqueue_offset(iq, scan_index, &len); + + // At the end of the queue? We're done scanning. + if (offset == (uint64_t) -1) + break; + + // There is a new message; update our iterator + scan_index++; + + // If the scanned slot does not contain an iqsync message, + // we can ignore it and move on to the next slot + const struct iqsync_data * const old_msg = iqsync_data_msg(iq, offset); + if (!old_msg) + continue; + + // Update the hash table for this original source and the iqueue + // through which it might have been routed. + // If this races with other iqsyncs, the highest value will win. + // The update does not need to do an atomic in the case were another + // iqsync has already updated the table. + const uint64_t orig_src = be64toh(old_msg->orig_src); + const uint64_t orig_index = be64toh(old_msg->orig_index); + const uint64_t route_src = be64toh(old_msg->src); + const uint64_t route_index = be64toh(old_msg->iq_index); + + // If the source and routed sources are the same, the indices + // had better agree. Otherwise something has gone horribly wrong + // in the protocol. + if (orig_src == route_src + && orig_index != route_index) + TSABORTX("%s: iqsync protocol error!" + " original source %"PRIu64".%"PRIu64 + " != routed %"PRIu64".%"PRIu64 + "!", + iqsync->remote.name, + orig_src, + orig_index, + route_src, + route_index + ); + + // Update the original and routing iqueue indices in the shared table + // The value recorded is the next expected index, not the most + // recently seen index. This allows the value to start at 0, + // rather than -1. 
+ iqsync_hash_update(iqsync->sources, orig_src, orig_index + 1); + if (orig_src != route_src) + iqsync_hash_update(iqsync->sources, route_src, route_index + 1); + + // If the caller has specified a source that matches either id, + // update the tell them the index for that source + if (src_id == orig_src) + src_index = orig_index + 1; + else + if (src_id == route_src) + src_index = route_index + 1; + } + + // We've hit the end of the iqueue. Update the tail pointer in + // in the shared hash table with the location to begin searching the + // next scan; this might fail if other threads have + // advanced it since this process read the scan index, but that is ok + // since the scan_index is only maintained with loose consistency. + const uint64_t cur_scan_index = iqsync->scan_index->value; + if (cur_scan_index < scan_index) + shash_update( + iqsync->sources, + iqsync->scan_index, + cur_scan_index, + scan_index + ); + + // Indicate to the caller the src_index of the last message from + // src_id. + if (src_index_out) + *src_index_out = src_index; + + // And return how far we have scanned through the iqueue. + // If there is no race, this will be the first empty element of + // the iqueue, which is where it will append the incoming message. + + if (orig_scan_index != scan_index) + TSLOGX(TSDEBUG, "%s: Scanned from %"PRIu64" to %"PRIu64, + iqsync->local.name, + orig_scan_index, + scan_index + ); + + return scan_index; +} + + +/** Read all the messages that have not yet been processed to update + * the latest sequence number from each of the sources. + * + * \param cur_src The incoming message source to check against, or NULL to + * examine the entire queue without concern about sources. + * + * \return The slot that the current message should be written to, + * or IQUEUE_MSG_BAD_ID if the message should be discarded,. 
+ */ +static inline uint64_t +iqsync_sources_scan( + iqsync_t * const iqsync, + const uint64_t cur_src, + const uint64_t cur_index +) +{ + // One of our own? + if (cur_src == iqsync->local.creation) + goto discard; + + // Refresh to the end of the iqueue and retrieve the + // latest message index from the current source. + uint64_t src_index; + const uint64_t scan_index = iqsync_sources_scan_all( + iqsync, + cur_src, + &src_index + ); + + // So far, so good. If the incoming message is the next expected + // one, then we're done. + if (cur_index >= src_index) + return scan_index; + + // The incoming message has already been seen. + // Signal that it should be discarded. +discard: + if (iqsync->verbose || iqsync->warned_cycle == 0) + { + iqsync->warned_cycle = 1; + TSLOGX(TSINFO, "%s: Discarding %"PRIu64".%"PRIu64, + iqsync->remote.name, + cur_src, + cur_index + ); + } + + return IQUEUE_MSG_BAD_ID; +} + + +static int +iqsync_start_recv( + iqsync_t * const iqsync +) +{ + iqueue_t * const iq = iqsync->iq; + + // Read where they want us to start + struct iqsync_start start; + if (tsio_read_all( + iqsync->read_fd, + &start, + sizeof(start) + ) != sizeof(start)) + { + TSLOGX(TSERROR, "%s: Start message read failed", iqsync->remote.name); + return -1; + } + + const uint64_t remote_magic = be64toh(start.magic); + if (remote_magic != IQSYNC_START_MAGIC) + { + TSLOGX(TSERROR, "%s: Start message bad magic %"PRIx64" != expected %"PRIx64, + iqsync->remote.name, + remote_magic, + IQSYNC_START_MAGIC + ); + return -1; + } + + iqsync->local.index = be64toh(start.start_index); + //uint64_t flags = be64toh(start.flags); + + if (iqsync->local.index > iqueue_entries(iq)) + TSLOGX(TSWARN, + "%s: Starting at %"PRIu64", but only %"PRIu64" entries so far", + iqsync->local.name, + iqsync->local.index, + iqueue_entries(iq) + ); + + if (iqsync->verbose) + TSLOGX(TSINFO, "%s: Starting at %"PRIu64"/%"PRIu64" and will %s when done", + iqsync->local.name, + iqsync->local.index, + 
iqueue_entries(iq), + iqsync->do_tail ? "tail" : "exit" + ); + + return 0; +} + + +static int +iqsync_push_one( + iqsync_t * const iqsync, + const uint64_t local_index, + const uint64_t offset, + size_t data_len +) +{ + struct iqsync_data msg = { + .magic = htobe64(IQSYNC_DATA_MAGIC), + .src = htobe64(iqsync->local.creation), + .orig_src = htobe64(iqsync->local.creation), + .orig_index = htobe64(local_index), + .iq_index = htobe64(local_index), + .len = htobe32((uint32_t) data_len), + }; + + // Check to see if this is one that we received from + // the remote side. If so, we do not send it on. + const struct iqsync_data * const sync_msg = iqsync_data_msg(iqsync->iq, offset); + if (sync_msg) + { + // Avoid the obvious cycle to our direct correspondent + if (sync_msg->src == htobe64(iqsync->remote.creation)) + return 1; + + // Flag the message with the original source and index + // Note that the values in the sync_msg are already in + // network byte order + msg.orig_src = sync_msg->orig_src; + msg.orig_index = sync_msg->orig_index; + } + + const void * data = iqueue_get_data(iqsync->iq, offset, 1); + + struct iovec iov[] = { + { .iov_base = &msg, .iov_len = sizeof(msg) }, + { .iov_base = (void*)(uintptr_t) data, .iov_len = data_len }, + }; + + size_t total_len = iov[0].iov_len + iov[1].iov_len; + + if (iqsync->verbose) + TSLOGX(TSINFO, "%s: sending index %"PRIu64": %zu bytes", + iqsync->local.name, + local_index, + data_len + ); + + ssize_t wlen = tsio_writev_all(iqsync->write_fd, total_len, iov, 2); + if (wlen < 0) + { + TSLOG(TSERROR, "%s: write failed!", iqsync->remote.name); + return -1; + } + + if (wlen != (ssize_t) total_len) + { + TSLOGX(TSWARN, "%s: Connection closed", iqsync->remote.name); + return 0; + } + + iqsync->local.count++; + return 1; +} + + +/** Prevent the aggregate read rate from exceeding the rate limit. + * This makes iqsync on a large file less likely to blow out the caches. 
+ */ +static void +iqsync_rate_limit( + iqsync_t * const iqsync, + uint64_t start_time, + size_t len +) +{ + // Compute a moving, weighted average of the message len to determine + // the appropriate sleep time + const uint64_t avg_len = iqsync->avg_msg_len = (iqsync->avg_msg_len * 7 + len) / 8; + const uint64_t limit = iqsync->rate_limit << 20; // scale MB/s to B/s + + if (limit == 0) + return; + + uint64_t delta = tsclock_getnanos(0) - start_time; + + uint64_t ns_sleep_time = (avg_len * 1000000000ul) / limit; + if (ns_sleep_time < delta) + return; + + if (ns_sleep_time < 60000) + { + // Just busy wait until our time period has expired + while ((uint64_t) tsclock_getnanos(0) < start_time + ns_sleep_time) + continue; + } else { + // The minimum sleep time seems to be about 60 usec, + ns_sleep_time -= delta; + nanosleep(&(struct timespec) { + .tv_sec = ns_sleep_time / 1000000000ull, + .tv_nsec = ns_sleep_time % 1000000000ull + }, NULL); + } +} + + +/** Setup the heartbeat table in the iqueue and local maps of it. + * + * If create_flag is not set, the function only checks to see if + * there exists a heartbeat table. + * Otherwise it will try to create one and allocate the local copies + * for determining when heartbeat status has changed. + */ +static int +iqsync_setup_heartbeats( + iqsync_t * const iqsync, + const int create_flag +) +{ + if (iqsync->heartbeats_hash) + return 1; + + tslock(iqsync->heartbeats_lock); + if (iqsync->heartbeats_hash) + { + tsunlock(iqsync->heartbeats_lock); + return 1; + } + + shash_t * sh = iqueue_writer_table( + iqsync->iq, + 0, // default writer table + create_flag + ); + + // If it still doesn't exist, don't worry about it. No heartbeats + // will be pushed until it is created. 
+ if (!sh) + { + if (create_flag) + TSABORTX("%s: Unable to create heartbeat table", + iqsync->local.name + ); + + tsunlock(iqsync->heartbeats_lock); + return 0; + } + + // Create a thread-local version + sh = shash_copy(sh); + iqsync->heartbeats = shash_entries(sh, &iqsync->heartbeats_max); + + TSLOGX(TSDEBUG, "%s: Heartbeat table %p has %u entries", + iqsync->local.name, + iqsync->heartbeats, + iqsync->heartbeats_max + ); + + iqsync->heartbeats_copy = calloc(1, + iqsync->heartbeats_max * sizeof(*iqsync->heartbeats_copy) + ); + if (!iqsync->heartbeats_copy) + TSABORT("failed to allocate %u writers", iqsync->heartbeats_max); + + iqsync->heartbeat_msg = calloc(1, + sizeof(*iqsync->heartbeat_msg) + + iqsync->heartbeats_max * sizeof(*iqsync->heartbeat_msg->writers) + ); + if (!iqsync->heartbeat_msg) + TSABORT("failed to allocate %u writers message", iqsync->heartbeats_max); + + iqsync->heartbeats_hash = sh; + tsunlock(iqsync->heartbeats_lock); + + return 1; +} + + +/** Send a set of messages, from iqsync->local.index to end_index. + * \return 0 on success, -1 on any failures. 
+ */ +static int +iqsync_send_set( + iqsync_t * const iqsync, + const uint64_t end_index +) +{ + const uint64_t start_index = iqsync->local.index; + + while (iqsync->local.index < end_index) + { + const uint64_t i = iqsync->local.index++; + size_t len; + const uint64_t offset = iqueue_offset(iqsync->iq, i, &len); + + if (offset == (uint64_t) -1) + { + TSLOGX(TSERROR, "%s: No data at index %"PRIu64"?", + iqsync->local.name, + i + ); + return -1; + } + + uint64_t start_time = tsclock_getnanos(0); + if (iqsync_push_one(iqsync, i, offset, len) <= 0) + return -1; + + iqsync_rate_limit(iqsync, start_time, len); + } + + if (start_index != end_index) + TSLOGX(TSDEBUG, "%s: Send %"PRIu64" to %"PRIu64, + iqsync->local.name, + start_index, + end_index + ); + + return 0; +} + + + +static int +iqsync_push_heartbeats( + iqsync_t * const iqsync +) +{ + if (!iqsync_setup_heartbeats(iqsync, 0)) + return 0; + struct iqsync_heartbeat * const msg = iqsync->heartbeat_msg; + + unsigned count = 0; + + for (unsigned i = 0 ; i < iqsync->heartbeats_max ; i++) + { + // This does not guarantee ordering of updates to different keys. + const shash_entry_t heartbeat = iqsync->heartbeats[i]; + shash_entry_t * const copy = &iqsync->heartbeats_copy[i]; + if (heartbeat.key == 0) + break; + if (heartbeat.key == copy->key && heartbeat.value == copy->value) + continue; + + // A new timestamp. Update the cached copy + memcpy(copy, &heartbeat, sizeof(*copy)); + memcpy(&msg->writers[count++], &heartbeat, sizeof(*copy)); + } + + if (count == 0) + return 0; + + msg->magic_be64 = htobe64(IQSYNC_HEARTBEAT_MAGIC); + msg->count_be64 = htobe64(count); + + // Make sure that all pending messages have been sent + // to ensure that heartbeats do not arrive before any messages + // that were written before the hearbeat. 
+ if (iqsync_send_set(iqsync, iqueue_entries(iqsync->iq)) < 0) + return -1; + + // At this point the cached copy of the heartbeat table might be + // old, but it meets the guarantee that all messages that were + // present at the time it was duplicated have been sent to the + // destination iqueue. It is now safe to send the entire table + // of updates. + + ssize_t wlen = tsio_write_all( + iqsync->write_fd, + msg, + sizeof(*msg) + count * sizeof(msg->writers[0]) + ); + if (wlen <= 0) + return -1; + + TSLOGX(TSDEBUG, "%s: Sent %u heartbeat updates", iqsync->local.name, count); + + return 0; +} + + +/** Send entries to the remote side. + * + * At this point everything is correctly configured and we have exclusive + * access to the write file descriptor. + */ +static void * +iqsync_push_thread( + void * const iqsync_ptr +) +{ + iqsync_t * const iqsync = iqsync_ptr; + iqueue_t * const iq = iqsync->iq; + + while (!iqsync->do_shutdown) + { + // By only sending up to the end of the iqueue at the current + // time, we ensure that progress is made on the heartbeat sending. 
+ if (iqsync_send_set(iqsync, iqueue_entries(iq)) < 0) + { + TSLOGX(TSWARN, "%s: Send set failed", iqueue_name(iq)); + break; + } + + // Everytime there is no data, check for heartbeats + if (iqsync_push_heartbeats(iqsync) < 0) + { + TSLOGX(TSWARN, "%s: Heartbeat send failed", iqueue_name(iq)); + break; + } + + if (!iqsync->do_tail) + { + TSLOGX(TSINFO, "%s: Reached end and not tailing", iqueue_name(iq)); + break; + } + + if (iqueue_is_sealed(iq)) + { + TSLOGX(TSINFO, "%s: Has been sealed", iqueue_name(iq)); + break; + } + + if (iqsync->usleep_time) + usleep(iqsync->usleep_time); + } + + if (iqsync->verbose) + TSLOGX(TSINFO, "%s: Done sending at index %"PRIu64, + iqsync->remote.name, + iqsync->local.index + ); + + close(iqsync->write_fd); + if (!iqsync->do_pull) + iqsync->do_shutdown = 1; + + return NULL; +} + + +/** Receive a new iqsync_data msg and determine if it is a duplicate of + * one already received. + * + * \param msg should point to a message that has been received in the + * iqueue that is being synchronized. + * + * \return 1 if committed, 0 if it is a duplicate and should be discarded + * (with iqueue_realloc() to zero length) or -1 on error. + */ +static int +iqsync_recv( + iqsync_t * const iqsync, + const struct iqsync_data * const msg, + iqueue_msg_t iqmsg +) +{ + const size_t data_len = be32toh(msg->len); + const uint64_t orig_src = be64toh(msg->orig_src); + const uint64_t orig_index = be64toh(msg->orig_index); + const uint64_t remote_index = be64toh(msg->iq_index); + + // Adjust the message to skip the data at the head + const size_t data_offset = offsetof(struct iqsync_data, data); + const iqueue_msg_t new_iqmsg = iqueue_msg( + iqueue_msg_offset(iqmsg) + data_offset, + iqueue_msg_len(iqmsg) - data_offset + ); + + while (1) + { + const uint64_t local_index = iqsync_sources_scan( + iqsync, + orig_src, + orig_index + ); + if (local_index == IQUEUE_MSG_BAD_ID) + return 0; + + // Try to store the new entry at the last slot scanned. 
+ // Note that the index entry points to the data section, + // not the iqsync_msg header portion (which will be in the file for + // future reference). + int rc = iqueue_try_update( + iqsync->iq, + local_index, + new_iqmsg + ); + + // We were not successful; rescan the sources and try again + if (rc == IQUEUE_STATUS_HAS_DATA) + { + TSLOGX(TSDEBUG, "%s: Lost race at %"PRIu64" for %"PRIx64".%"PRIu64, + iqsync->local.name, + local_index, + orig_src, + orig_index + ); + continue; + } + + if (rc == IQUEUE_STATUS_SEALED) + { + TSLOGX(TSWARN, "%s: File has been sealed. Stopping sync.", iqueue_name(iqsync->iq)); + return -1; + } + + if (rc != 0) + { + TSLOGX(TSERROR, "%s: Unable to store %zu bytes at %"PRIu64"! rc=%d", + iqsync->local.name, + data_len, + local_index, + rc + ); + return -1; + } + + // We have successfully written at the desired slot, which means + // no new messages arrived while we were consulting the hash tables. + if (iqsync->verbose) + TSLOGX(TSINFO, "%s: Stored remote %"PRIu64" as %"PRIu64, + iqsync->local.name, + remote_index, + local_index + ); + + iqsync->remote.count++; + iqsync->remote.index = remote_index; + + return 1; + } +} + + +static int +iqsync_pull_one_data( + iqsync_t * const iqsync, + iqueue_allocator_t * const allocator +) +{ + const size_t alloc_len = sizeof(struct iqsync_data) + IQUEUE_MSG_MAX; + + iqueue_msg_t iqmsg; + struct iqsync_data * const msg = iqueue_allocate( + allocator, + alloc_len, + &iqmsg + ); + if (!msg) + { + TSLOGX(TSERROR, "%s: Unable to allocate message", iqsync->local.name); + return -1; + } + + ssize_t rlen = tsio_read_all( + iqsync->read_fd, + ((uint8_t*) msg) + sizeof(msg->magic), + sizeof(*msg) - sizeof(msg->magic) + ); + if (rlen < 0) + return -1; // error + if (rlen != sizeof(*msg) - sizeof(msg->magic)) + return 0; // closed fd + + // Fill in the magic header + msg->magic = htobe64(IQSYNC_DATA_MAGIC); + + // If this is just a keep-alive, we have nothing else to process + if (msg->len == 0 && msg->src == 0 
&& msg->orig_src == 0) + { + iqueue_realloc_bulk(allocator, &iqmsg, alloc_len, 0); + return 1; + } + + const size_t data_len = be32toh(msg->len); + const size_t msg_len = sizeof(*msg) + data_len; + + iqsync->remote.len += data_len; + + if (data_len > IQUEUE_MSG_MAX) + { + TSLOGX(TSERROR, "%s: Message %"PRIu64" len %zu greater than max %zu", + iqsync->remote.name, + be64toh(msg->iq_index), + data_len, + (size_t) IQUEUE_MSG_MAX + ); + return -1; + } + + rlen = tsio_read_all(iqsync->read_fd, msg->data, data_len); + if (rlen < 0) + return -1; // error + if (rlen != (ssize_t) data_len) + return 0; // closed fd + + if (iqueue_realloc_bulk( + allocator, + &iqmsg, + alloc_len, + msg_len + ) < 0) { + TSLOGX(TSERROR, "%s: Unable to resize from %zu to %zu?", + iqsync->local.name, + alloc_len, + msg_len + ); + return -1; + } + + // Now that the message has been fully received into the buffer, + // try to post it to the iqueue. + int rc = iqsync_recv(iqsync, msg, iqmsg); + if (rc == 1) + return 1; + if (rc < 0) + return -1; + + // Too old or from ourselves; discard it, but do not signal an error + iqueue_realloc_bulk(allocator, &iqmsg, alloc_len, 0); + return 1; +} + + +static int +iqsync_pull_one_heartbeat( + iqsync_t * const iqsync +) +{ + iqsync_setup_heartbeats(iqsync, 1); + + struct iqsync_heartbeat msg; + ssize_t rlen; + + rlen = tsio_read_all( + iqsync->read_fd, + ((uint8_t*) &msg) + sizeof(msg.magic_be64), + sizeof(msg) - sizeof(msg.magic_be64) + ); + if (rlen <= 0) + return (int) rlen; + + const uint64_t count = be64toh(msg.count_be64); + if (count > iqsync->heartbeats_max) + TSABORTX("%s: Sent %"PRIu64" heartbeats? 
Max %u", + iqsync->remote.name, + count, + iqsync->heartbeats_max + ); + + shash_entry_t heartbeats[count]; + rlen = tsio_read_all(iqsync->read_fd, heartbeats, sizeof(heartbeats)); + if (rlen <= 0) + return rlen; + + for (unsigned i = 0 ; i < count ; i++) + { + shash_entry_t * const heartbeat = &heartbeats[i]; + + if (heartbeat->key == 0 || heartbeat->key == ~(uint64_t) 0) + { + TSLOGX(TSWARN, "%s: Sent writer with invalid id/timestamp %"PRIx64":%"PRIx64"?", + iqsync->remote.name, + heartbeat->key, + heartbeat->value + ); + continue; + } + + iqsync_hash_update( + iqsync->heartbeats_hash, + heartbeat->key, + heartbeat->value + ); + } + + TSLOGX(TSDEBUG, "Received %"PRIu64" heartbeats", count); + + return 1; +} + + +/** Read a iqsync_msg and append the data to the local iqueue. + * \return 1 on success, 0 on connection closed, -1 on error. + */ +static int +iqsync_pull_one( + iqsync_t * const iqsync, + iqueue_allocator_t * const allocator +) +{ + uint64_t magic_be64; + ssize_t rlen = tsio_read_all(iqsync->read_fd, &magic_be64, sizeof(magic_be64)); + if (rlen < 0) + return -1; + if (rlen != sizeof(magic_be64)) + return 0; + + const uint64_t magic = be64toh(magic_be64); + if (magic == IQSYNC_DATA_MAGIC) + return iqsync_pull_one_data(iqsync, allocator); + if (magic == IQSYNC_HEARTBEAT_MAGIC) + return iqsync_pull_one_heartbeat(iqsync); + + TSLOGX(TSERROR, "%s: Bad magic %"PRIx64". Unknown type!", + iqsync->remote.name, + magic + ); + return -1; +} + + +/** Send the final part of the handshake to tell the remote side + * to begin sending data. + * + * This requires that the source table be up to date with the sequence + * number that is desired from the remote side; if it is not exactly right + * there is no harm. The remote side will send some messages that will be + * discarded since they are already present in the local iqueue. 
+ */ +static int +iqsync_start_send( + iqsync_t * const iqsync +) +{ + // If there is a writer in the table, we know that we have heard from + // at least that position and can resume there. If there was a mistake + // in computing the starting index, it might be too low and the first + // few incoming packets will be dropped. + iqsync->remote.index = iqsync_sources_get( + iqsync, + iqsync->remote.creation + ); + + // Send the ack asking to start at the next message, or 0 + // if there is no value already recorded for this source. + struct iqsync_start start = { + .magic = htobe64(IQSYNC_START_MAGIC), + .start_index = htobe64(iqsync->remote.index), + .flags = htobe64(0), + }; + + if (tsio_write_all( + iqsync->write_fd, + &start, + sizeof(start) + ) != sizeof(start)) + { + TSLOGX(TSERROR, "%s: Write error on start message", iqsync->remote.name); + return -1; + } + + if (iqsync->verbose) + TSLOGX(TSINFO, "send RX request start at %"PRIu64, iqsync->remote.index); + + return 0; +} + + +/** Read entries from the remote iqueue and write them into our local one. + * + * At this point everything is correctly configured and handshaken, so + * we have exclusive read access to the file descriptor. 
+ */ +static void * +iqsync_pull_thread( + void * const iqsync_ptr +) +{ + iqsync_t * const iqsync = iqsync_ptr; + + // Pre-allocate some space for incoming messages + iqueue_allocator_t allocator; + if (iqueue_allocator_init( + iqsync->iq, + &allocator, + IQUEUE_MSG_MAX * 4, // try to avoid re-filling too often + 1 + ) < 0) + TSABORTX("%s: Unable to create allocator", iqsync->local.name); + + // Read messages until we have an error or a closed connection + int rc; + while ((rc = iqsync_pull_one(iqsync, &allocator)) == 1) + { + // nop + // \todo: check for sealed iqueue + } + + if (rc == 0) + { + if (iqsync->verbose) + TSLOGX(TSINFO, "%s: Connection closed: index %"PRIu64, + iqsync->remote.name, + iqsync->remote.index + ); + } else { + TSLOG(TSERROR, "%s: Read failed: index %"PRIu64, + iqsync->remote.name, + iqsync->remote.index + ); + } + + iqsync->do_shutdown = 1; + close(iqsync->read_fd); + + return NULL; +} + + + +/** Send the handshake message to the remote side. + * This describes the parameters of the iqueue on our side. + */ +static int +iqsync_handshake_send( + iqsync_t * const iqsync +) +{ + iqueue_t * const iq = iqsync->iq; + + iqsync->local.hdr = iqueue_header(iq, &iqsync->local.hdr_len); + iqsync->local.creation = iqueue_creation(iq); + iqsync->local.entries = iqueue_entries(iq); + + if (iqsync->verbose) + TSLOGX(TSINFO, "%s: Source creation=%"PRIu64" entries=%"PRIu64, + iqsync->local.name, + iqsync->local.creation, + iqsync->local.entries + ); + + struct iqsync_handshake handshake = { + .magic = htobe64(IQSYNC_HANDSHAKE_MAGIC), + .creation = htobe64(iqsync->local.creation), + .entries = htobe64(iqsync->local.entries), + .hdr_len = htobe64(iqsync->local.hdr_len), + }; + + // \todo Is this safe? What if hdr is long? Should there be + // a split-phase handshake to ensure that we do not deadlock? 
+ struct iovec iov[] = { + { .iov_base = &handshake, .iov_len = sizeof(handshake) }, + { .iov_base = (void*)(uintptr_t) iqsync->local.hdr, .iov_len = iqsync->local.hdr_len }, + }; + size_t total_len = iov[0].iov_len + iov[1].iov_len; + + ssize_t wlen = tsio_writev_all(iqsync->write_fd, total_len, iov, 2); + if (wlen != (ssize_t) total_len) + { + TSLOGX(TSERROR, + "%s: handshake write failed: %zd != %zu", + iqsync->remote.name, + wlen, + total_len + ); + return -1; + } + + return 0; +} + + +/** Receive a remote handshake message and update our view of the + * remote queue. + */ +static int +iqsync_handshake_recv( + iqsync_t * const iqsync +) +{ + // Read the handshake from the remote side + struct iqsync_handshake reply; + ssize_t rlen = tsio_read_all(iqsync->read_fd, &reply, sizeof(reply)); + if (rlen != sizeof(reply)) + { + TSLOGX(TSERROR, "%s: handshake read failed", iqsync->remote.name); + return -1; + } + + const uint64_t remote_magic = be64toh(reply.magic); + if (remote_magic != IQSYNC_HANDSHAKE_MAGIC) + { + TSLOGX(TSERROR, "%s: bad handshake magic: %"PRIu64" != %"PRIu64, + iqsync->remote.name, + remote_magic, + IQSYNC_HANDSHAKE_MAGIC + ); + return -1; + } + + iqsync->remote.creation = be64toh(reply.creation); + iqsync->remote.entries = be64toh(reply.entries); + iqsync->remote.hdr_len = be64toh(reply.hdr_len); + + if (iqsync->remote.hdr_len == 0) + return 0; + + iqsync->remote.hdr = malloc(iqsync->remote.hdr_len); + if (!iqsync->remote.hdr) + TSABORT("hdr alloc failed: %"PRIu64" bytes", iqsync->remote.hdr_len); + + if (tsio_read_all( + iqsync->read_fd, + iqsync->remote.hdr, + iqsync->remote.hdr_len + ) != (ssize_t) iqsync->remote.hdr_len) + { + TSLOGX(TSERROR, "read of remote header failed"); + return -1; + } + + return 0; +} + + + +/** Receive the remote handshake and create the local iqueue based on the + * remote parameters. 
+ */ +static int +iqsync_handshake_clone( + iqsync_t * const iqsync +) +{ + TSLOGX(TSINFO, "%s: Not present; cloning from remote %s", + iqsync->local.name, + iqsync->remote.name + ); + + // Receive the remote handshake message before sending ours + if (iqsync_handshake_recv(iqsync) < 0) + return -1; + + iqsync->local.hdr_len = iqsync->remote.hdr_len; + + if (iqsync->iq) + { + TSLOGX(TSINFO, "%s: Using existing local iqueue", iqueue_name(iqsync->iq)); + } else + { + const uint64_t local_creation = tsclock_getnanos(0); + iqsync->iq = iqueue_create( + iqsync->local.name, + local_creation, + iqsync->remote.hdr, + iqsync->remote.hdr_len + ); + + if (!iqsync->iq) + { + TSLOGX(TSERROR, "%s: Unable to open", iqsync->local.name); + return -1; + } + if (iqueue_creation(iqsync->iq) != local_creation) + { + TSLOGX(TSWARN, "%s: already exists; verify header?", iqsync->local.name); + } + } + + // Exchange handshake messages now that we have an iqueue created + if (iqsync_handshake_send(iqsync) < 0) + return -1; + + return 0; +} + + +/** Both sides should exist; do the normal exchange */ +static int +iqsync_handshake_normal( + iqsync_t * const iqsync +) +{ + // only get write access if we are pulling i + const bool writable = iqsync->do_pull ? 
true : false; + + if (iqsync->iq != NULL) + { + iqsync->local.name = iqueue_name(iqsync->iq); + } + else + { + iqsync->iq = iqueue_open(iqsync->local.name, writable); + if (!iqsync->iq) + { + TSLOGX(TSERROR, "%s: Unable to open", iqsync->local.name); + return -1; + } + iqsync->close_iq_on_shutdown = true; + } + + // Exchange handshake messages + if (iqsync_handshake_send(iqsync) < 0) + return -1; + if (iqsync_handshake_recv(iqsync) < 0) + return -1; + + if (!iqsync->do_hdr_validate) + return 0; + + if (iqsync->remote.hdr_len != iqsync->local.hdr_len) + { + TSLOGX(TSERROR, "%s: remote header %"PRIu64" bytes != local %"PRIu64, + iqsync->remote.name, + iqsync->remote.hdr_len, + iqsync->local.hdr_len + ); + return -1; + } + + if (memcmp(iqsync->remote.hdr, iqsync->local.hdr, iqsync->local.hdr_len) != 0) + { + TSLOGX(TSERROR, "Remote header"); + TSHDUMP(TSERROR, iqsync->remote.hdr, iqsync->remote.hdr_len); + TSLOGX(TSERROR, "Local header"); + TSHDUMP(TSERROR, iqsync->local.hdr, iqsync->local.hdr_len); + return -1; + } + + return 0; +} + + +static void * +iqsync_send_hb_thread( + void * const iqsync_ptr +) +{ + iqsync_t * const iqsync = iqsync_ptr; + + const struct iqsync_data msg = { + .magic = htobe64(IQSYNC_DATA_MAGIC), + .src = 0, + .orig_src = 0, + .len = 0, + }; + + while (!iqsync->do_shutdown) + { + sleep(1); + + ssize_t wlen = write(iqsync->write_fd, &msg, sizeof(msg)); + if (wlen == sizeof(msg)) + continue; + + TSLOG(TSERROR, "%s: Short write", iqsync->remote.name); + break; + } + + iqsync->do_shutdown = 1; + close(iqsync->write_fd); + + return NULL; +} + + +static void * +iqsync_recv_hb_thread( + void * const iqsync_ptr +) +{ + iqsync_t * const iqsync = iqsync_ptr; + + while (!iqsync->do_shutdown) + { + struct iqsync_data msg; + ssize_t rlen = read(iqsync->read_fd, &msg, sizeof(msg)); + if (rlen != sizeof(msg)) + { + TSLOG(TSERROR, "%s: Short read?", iqsync->remote.name); + break; + } + + if (msg.magic == htobe64(IQSYNC_DATA_MAGIC) + && msg.len == 0 + && 
msg.src == 0 + ) + continue; + + TSLOGX(TSWARN, "%s: Sent non-empty heartbeat?", iqsync->remote.name); + break; + } + + iqsync->do_shutdown = 1; + close(iqsync->read_fd); + + return NULL; +} + + +static void * +iqsync_init_thread( + void * const iqsync_ptr +) +{ + iqsync_t * const iqsync = iqsync_ptr; + + // Rescan to build our table of sources; we don't care about what + // we find, so we ignore the result and don't look for anything in + // particular. We only do this if we are pulling; push only mode + // does not need to scan since the remote side will tell us where + // to start. This saves walking the entire file for a read-only mode + if (iqsync->do_pull) + { + // If the file has been iqsync'ed already, then the sources + // will be updated into the writer table and not much should need to + // be scanned. Bring things up to date with the end of the file + // just in case. + iqsync_sources_setup(iqsync); + iqsync_sources_scan_all(iqsync, 0, NULL); + + // Handshake and scan done, exchange start messages + + if (iqsync_start_send(iqsync) < 0) { + iqsync->do_shutdown = 1; + return NULL; + } + } + + // Handshake and scan done, exchange start messages + if (iqsync->do_push + && iqsync_start_recv(iqsync) < 0) { + iqsync->do_shutdown = 1; + return NULL; + } + + // Start the clock + iqsync->start_time = iqsync->report_time = tsclock_getnanos(0); + + // We create a pull thread no matter what; it will just monitor the + // socket to detect a close. The pull thread doesn't do any + // CPU pinning since it is spending all of its time in a read + if (pthread_create( + &iqsync->pull_thread, + NULL, + iqsync->do_pull ? iqsync_pull_thread : iqsync_recv_hb_thread, + iqsync + ) < 0) + TSABORTX("Unable to create pull thread"); + + // Likewise, the report thread is sleeping most of the time so it does + // not do any cpu pinning. 
+    if (iqsync->report_interval
+    && pthread_create(&iqsync->stat_thread, NULL, iqsync_stat_thread, iqsync) < 0)
+        TSABORTX("Unable to create stats thread");
+
+    // TODO: It would be better to do the work above on the pinned cpu, but the
+    // spawned threads would inherit the affinity mask.  This could be
+    // re-factored to work better (hold the threads or save the mask).
+    if (iqsync->local_cpu)
+    {
+        char * end;
+        int cpu = strtoul(iqsync->local_cpu, &end, 0);
+        if (end == iqsync->local_cpu)
+            TSABORTX("Unable to parse local cpu '%s'", iqsync->local_cpu);
+        if (tssched_set_thread_affinity(pthread_self(), cpu) < 0)
+            TSABORT("Unable to set cpu affinity to %d", cpu);
+        TSLOGX(TSINFO, "Pinned push thread to cpu %d", cpu);
+    }
+
+    if (iqsync->do_push)
+        return iqsync_push_thread(iqsync);
+    else
+        return iqsync_send_hb_thread(iqsync);
+}
+
+
+/** Start the iqsync handshake process and spin off the send/receive
+ * threads that handle the exchange of data (depending on --push / --pull).
+ * If --report-interval is specified a stat reporting thread will also
+ * be created.
+ *
+ * \todo Take advantage of atomic creation code. 
+ */ +int +iqsync_start( + iqsync_t * const iqsync +) +{ + // Check for an existing file + struct stat statbuf; + if (iqsync->do_clone + && stat(iqsync->local.name, &statbuf) < 0) + { + if (errno != ENOENT) + { + TSLOG(TSERROR, "%s: Unable to stat", iqsync->local.name); + return -1; + } + + if (iqsync_handshake_clone(iqsync) < 0) + return -1; + } else { + if (iqsync_handshake_normal(iqsync) < 0) + return -1; + } + + if (iqsync->do_prefetch + && iqueue_prefetch_thread(iqsync->iq, + &iqsync->prefetch_thread) != 0) + return -1; + + if (iqsync->do_syncbehind + && iqueue_syncbehind_thread(iqsync->iq, + &iqsync->syncbehind_thread) != 0) + return -1; + + // And kick off the threads to do the real work (the init thread + // will become the push thread) + if (pthread_create( + &iqsync->push_thread, + NULL, + iqsync_init_thread, + iqsync + ) < 0) + TSABORTX("Unable to create push thread"); + + return 0; +} + + +int +iqsync_wait( + iqsync_t * const iqsync +) +{ + // Wait for the thread to exit + pthread_join(iqsync->push_thread, NULL); + pthread_join(iqsync->pull_thread, NULL); + + if (iqsync->report_interval) + pthread_cancel(iqsync->stat_thread); + + if (!iqsync->do_server) + iqsync_stats(iqsync); + + if (iqsync->verbose) + TSLOGX(TSINFO, "Exiting"); + + if (iqsync->close_iq_on_shutdown) + iqueue_close(iqsync->iq); + + close(iqsync->read_fd); + close(iqsync->write_fd); + + return 0; +} diff --git a/src/iqueue-main.c b/src/iqueue-main.c new file mode 100644 index 0000000..d6f58fa --- /dev/null +++ b/src/iqueue-main.c @@ -0,0 +1,726 @@ +/* $TwoSigma: iqueue-main.c,v 1.17 2012/02/02 20:48:52 thudson Exp $ */ + +/* + * Copyright (c) 2010 Two Sigma Investments, LLC + * All Rights Reserved + * + * THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF + * Two Sigma Investments, LLC. + * + * The copyright notice above does not evidence any + * actual or intended publication of such source code. 
+ */ +#include "twosigma.h" +#include +#include +#include +#include +#include +#include +#include "tsutil.h" +#include "tsio.h" +#include "tsutil_dcat.h" +#include "iqueue.h" +#include "iqueue_cpio.h" +#include "tsclock.h" +#include "segfault.h" + +static struct option long_options[] = { + { "help", no_argument, 0, '?' }, + { "iqueue", required_argument, 0, 'f' }, + { "create", no_argument, 0, 'C' }, + { "header", no_argument, 0, 'H' }, + { "stats", no_argument, 0, 's' }, + { "watch", no_argument, 0, 'w' }, + { "append", no_argument, 0, 'a' }, + { "line", no_argument, 0, 'l' }, + { "follow", no_argument, 0, 'F' }, + { "seal", no_argument, 0, 'S' }, + { "archive", no_argument, 0, 'A' }, + { "zero", no_argument, 0, '0' }, + { "binary", no_argument, 0, 'b' }, + { "no-header", no_argument, 0, 'N' }, + { "copyin", no_argument, 0, '1' }, + { "copyout", no_argument, 0, '2' }, + { "writer", required_argument, 0, 'W' }, + { "print-entry", required_argument, 0, 'n' }, + { "begin", required_argument, 0, 'B' }, + { "end", required_argument, 0, 'E' }, + { "debug", required_argument, 0, 'd' }, + { 0, 0, 0, 0}, +}; + +static void +__attribute__((noreturn)) +usage( + FILE * stream, + const char * msg +) +{ + static const char usage_str[] = +"Usage:\n" +" iqueue [options...]\n" +"\n" +"Options:\n" +" -h | -? 
| --help This help\n" +"\n" +"General options:\n" +" -f | --iqueue /path/file iqueue file (.iqx) to update\n" +" -C | --create Initialize the message log\n" +" -H | --header Read a user header from stdin (for create)\n" +" -s | --stats Print stats about the queue\n" +" -w | --watch Print stats periodically\n" +" -a | --append Read a message from stdin and append it\n" +" -W | --writer N,V Set writer N heartbeat value to V\n" +" -l | --line Read a new message per line and append them\n" +" -F | --follow Print new messages as they are added\n" +" or re-open a sealed iqueue in --line append\n" +" -S | --seal Seal the iqueue from further writes\n" +" -A | --archive When sealing, archive the file\n" +" -N | --no-header Do not print the user header\n" +" -n | --print-entry N Print only entry number N\n" +" -B | --begin N Start from entry N\n" +" -E | --end N End with entry N\n" +" -d | --debug N Debug entry N (or the entire queue if N==-1)\n" +" -b | --binary Print binary messages\n" +" -0 | -z | --zero Print nul-separated messages in ascii mode\n" +"\n" +"DCAT formatting:\n" +"\n" +"For archiving and version transformations, iqueues can be transformed into\n" +"a DC formatted file with --copyout. The user header will be populated\n" +"into the DC user header, and restored when a new iqueue is created.\n" +"\n" +" --copyout Output in a form suitable for --copyin\n" +" --copyin Read in the form output by --copyout,\n" +" implies --create, and will copy user header\n" +" from dcat file.\n" +; + + fprintf(stream, "%s%s", msg, usage_str); + exit(EXIT_FAILURE); +} + + +static ssize_t +read_all( + int fd, + uint8_t * buf, + size_t len +) +{ + size_t offset = 0; + + while (offset < len) + { + ssize_t rc = read(fd, buf+offset, len-offset); + if (rc < 0) + { + TSLOG(TSERROR, "read failed"); + return -1; + } + + offset += rc; + + if (rc == 0) + return offset; + } + + TSLOGX(TSERROR, "message too long! 
limit is %zu bytes", len); + return -1; +} + + +static int +iqueue_append_one( + iqueue_t * const iq, + int fd, + int zero_out +) +{ + if (iqueue_is_sealed(iq)) + TSABORTX("can not append: iqueue is sealed"); + + uint8_t * const buf = calloc(1, IQUEUE_MSG_MAX); + const size_t max_size = zero_out ? IQUEUE_MSG_MAX - 1 : IQUEUE_MSG_MAX; + ssize_t len = read_all(fd, buf, max_size); + if (len < 0) + TSABORTX("Error reading from stdin!"); + if (zero_out) + buf[len++] = '\0'; + + int rc = iqueue_append(iq, buf, len); + free(buf); + + if (rc != 0) + { + TSLOGX(TSERROR, "%s: Update failed rc=%d", iqueue_name(iq), rc); + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} + + +/** Read a line with either a newline or zero separator. + * + * On end of file, the last line will be returned, with or without a separator. + * + * \return the number of bytes read, including the separator. + */ +static ssize_t +read_entire_line( + const int fd, + char * const buf, + const size_t len, + const char separator +) +{ + size_t off = 0; + + while (off < len-1) + { + ssize_t rlen = read(fd, &buf[off], 1); + if (rlen < 0) + return rlen; + + if (rlen == 0) + { + // Closed file and no partial line + if (off == 0) + return -1; + + // Partial line read + off++; + break; + } + + // Check for end of line + if (separator == buf[off++]) + break; + } + + // nul terminate the string, just in case and return the length + // read, including the separator + buf[off] = '\0'; + return off; +} + + +/** Convert a text string to binary and append */ +static int +iqueue_append_octdump( + iqueue_t * const iq, + const char * const buf, + const size_t len +) +{ + iqueue_msg_t iqmsg; + size_t out_len = 0; + uint8_t * const m = iqueue_allocate_raw(iq, len, &iqmsg); + + for (size_t i = 0 ; i < len ; i++) + { + char c = buf[i]; + if (c != '\\') + { + m[out_len++] = c; + continue; + } + + if (buf[i+1] == '\\') + { + m[out_len++] = '\\'; + i++; + continue; + } + + // Convert from an octal dump to hex + uint8_t o = 0; 
+ o = (buf[++i] - '0') | (o << 3); + o = (buf[++i] - '0') | (o << 3); + o = (buf[++i] - '0') | (o << 3); + + m[out_len++] = o; + } + + // Resize the iqmsg, throwing away some space at the end + iqmsg = iqueue_msg(iqueue_msg_offset(iqmsg), out_len); + + return iqueue_update(iq, iqmsg, NULL); +} + + +static int +iqueue_copyin_from_stdin( + const char * const filename, + const bool do_follow +) +{ + return iqueue_create_and_copyin_from_file(filename, do_follow, stdin); +} + +static int +iqueue_append_line( + iqueue_t * const iq, + const int fd, + const bool zero_separator, + const bool do_follow +) +{ + if (iqueue_is_sealed(iq)) + { + if (!do_follow) + TSABORTX("%s: can not append: iqueue is sealed", iqueue_name(iq)); + if (iqueue_reopen_wait(iq) < 0) + TSABORTX("%s: reopen failed", iqueue_name(iq)); + } + + char * const buf = calloc(1, IQUEUE_MSG_MAX); + + while (1) + { + int rc; + ssize_t len = read_entire_line(fd, buf, IQUEUE_MSG_MAX, zero_separator ? '\0' : '\n'); + if (len == -1) + break; + + // Do not include any newlines if we have them + if (zero_separator) + len--; + else + if (buf[len-1] == '\n') + buf[--len] = '\0'; + +retry: + rc = iqueue_append_octdump(iq, buf, len); + if (rc == 0) + continue; + + if (rc != IQUEUE_STATUS_SEALED && !do_follow) + { + TSLOGX(TSERROR, "%s: Update failed rc=%d", iqueue_name(iq), rc); + return EXIT_FAILURE; + } + + // We have a sealed iqueue; try to reopen until we have a new one + if (iqueue_reopen_wait(iq) < 0) + { + TSLOG(TSERROR, "%s: Unable to reopen", iqueue_name(iq)); + return EXIT_FAILURE; + } + + goto retry; + } + + free(buf); + + return EXIT_SUCCESS; +} + + +static int +iqueue_stats_output( + iqueue_t * const iq +) +{ + printf("%s:" + " %"PRIu64" (0x%"PRIx64")" + " data=%"PRIu64 + " entries=%"PRIu64 + "%s" + "\n", + iqueue_name(iq), + iqueue_creation(iq), + iqueue_creation(iq), + iqueue_data_len(iq), + iqueue_entries(iq), + iqueue_is_sealed(iq) ? 
" sealed" : "" + ); + + for (unsigned id = 0 ; id < 4 ; id++) + { + shash_t * const sh = iqueue_writer_table(iq, id, 0); + if (!sh) + continue; + + unsigned max_entries; + const shash_entry_t * const table = shash_entries(sh, &max_entries); + + printf("Writers %u:\n", id); + + for (unsigned i = 0 ; i < max_entries ; i++) + { + const shash_entry_t * const writer = &table[i]; + if (writer->key == 0) + break; + printf(" %"PRIx64": %"PRIu64"\n", + writer->key, + writer->value + ); + } + } + + return 0; +} + + +static int +iqueue_watch_output( + iqueue_t * const iq, + const unsigned sleep_us +) +{ + uint64_t old_time = tsclock_getnanos(0); + uint64_t old_entries = iqueue_entries(iq); + + printf("%s: creation %"PRIu64" (0x%"PRIx64"): %"PRIu64" entries\n", + iqueue_name(iq), + iqueue_creation(iq), + iqueue_creation(iq), + old_entries + ); + + while (1) + { + usleep(sleep_us); + const uint64_t new_time = tsclock_getnanos(0); + const uint64_t new_entries = iqueue_entries(iq); + if (new_entries == old_entries) + { + if (iqueue_is_sealed(iq)) + break; + continue; + } + + printf("%s: %"PRIu64" entries (%.0f entries/sec)\n", + iqueue_name(iq), + new_entries, + (new_entries - old_entries) * 1.0e9 / (new_time - old_time) + ); + + old_time = new_time; + old_entries = new_entries; + } + + TSLOGX(TSINFO, "%s: iqueue has been sealed", iqueue_name(iq)); + + return 0; +} + + +static int +iqueue_seal_and_archive( + iqueue_t * const iq, + const int do_archive +) +{ + const char * const old_name = iqueue_name(iq); + int rc = iqueue_seal(iq); + if (rc != 0) + { + TSLOGX(TSERROR, "%s: Unable to seal: %s", + old_name, + rc == IQUEUE_STATUS_SEALED ? "already sealed" : + rc == IQUEUE_STATUS_INDEX_INVALID ? 
"invalid index" : + "unknown error" + ); + return EXIT_FAILURE; + } + + if (!do_archive) + return EXIT_SUCCESS; + + if (iqueue_archive(iq, IQUEUE_MSG_BAD_ID) < 0) + return EXIT_FAILURE; + + TSLOGX(TSINFO, "%s: archived", iqueue_name(iq)); + + return EXIT_SUCCESS; +} + + +static int +iqueue_update_writer( + iqueue_t * const iq, + const char * const writer_flag +) +{ + char * end; + const uint64_t id = strtoul(writer_flag, &end, 0); + if (!end || end[0] != ',' || end[1] == '\0') + usage(stderr, "Unable to parse writer, must be 'N,V'\n"); + + const uint64_t value = strtoul(end+1, &end, 0); + if (!end || end[0] != '\0') + usage(stderr, "Unable to parse value, must be 'N,V'\n"); + + const unsigned table_id = 0; + shash_t * const sh = iqueue_writer_table(iq, table_id, 1); + if (!sh) + TSABORTX("%s: Unable to create/retrieve write table %u?", + iqueue_name(iq), + table_id + ); + + shash_entry_t * writer = shash_insert(sh, id, value); + if (writer) + { + // Writer did not exist; we are done. + TSLOGX(TSINFO, "%s: Writer %u.0x%"PRIx64" value %"PRIu64, + iqueue_name(iq), + table_id, + id, + value + ); + return 0; + } + + // Writer already existed. 
Retrieve it and try an update + writer = shash_get(sh, id); + if (!writer) + TSABORTX("%s: Writer %u.0x%"PRIx64" should exist?", + iqueue_name(iq), + table_id, + id + ); + + if (iqueue_writer_update(sh, writer, value)) + { + TSLOGX(TSINFO, "%s: Writer %u.0x%"PRIx64" value %"PRIu64, + iqueue_name(iq), + table_id, + id, + value + ); + return 0; + } + + TSLOGX(TSWARN, "%s: Writer %u.0x%"PRIx64" tried to write %"PRIu64", current value %"PRIu64, + iqueue_name(iq), + table_id, + id, + value, + writer->value + ); + + return 0; +} + + +int +main( + int argc, + char **argv +) +{ + segfault_handler_install(); + + const char * iqueue_file = NULL; + int create_flag = 0; + int writable = 0; + uint64_t print_entry = -1; + uint64_t debug_entry = -2; + uint64_t begin_entry = 0; + uint64_t end_entry = -1; + int option_index = 0; + int append = 0; + int append_line = 0; + int follow = 0; + int do_seal = 0; + int do_archive = 0; + int binary_out = 0; + int zero_out = 0; + int header = 1; + int do_stats = 0; + int do_watch = 0; + int read_header = 0; + int copyin_flag = 0; + int copyout_flag = 0; + const char * writer_flag = NULL; + + while (1) + { + int c = getopt_long( + argc, + argv, + "h?f:CGaFz0bAHI:D:n:B:E:swNl12W:", + long_options, + &option_index + ); + + if (c == -1) + break; + + switch (c) + { + case 0: break; + default: usage(stderr, ""); break; + case 'h': case '?': usage(stdout, ""); break; + + // Messagebox options + case 'f': iqueue_file = optarg; break; + case 'C': create_flag = 1; break; + case 'n': print_entry = strtoul(optarg, NULL, 0); break; + case 'd': debug_entry = strtoul(optarg, NULL, 0); break; + case 'B': begin_entry = strtoul(optarg, NULL, 0); break; + case 'E': end_entry = strtoul(optarg, NULL, 0); break; + case 'a': append = 1; writable = true; break; + case 'l': append_line = 1; writable = true; break; + case 'F': follow = 1; break; + case 'S': do_seal = 1; writable = true; break; + case 'A': do_archive = 1; break; + case 'z': + case '0': zero_out = 1; 
break; + case 'b': binary_out = 1; break; + case 'N': header = 0; break; + case 'H': read_header = 1; break; + case 's': do_stats = 1; break; + case 'w': do_watch = 1; break; + case '1': copyin_flag = 1; writable = true; break; + case '2': copyout_flag = 1; break; + case 'W': writer_flag = optarg; writable = true; break; + } + } + + if (!iqueue_file) + usage(stderr, "iqueue file must be specified!\n"); + if (argc != optind) + usage(stderr, "Extra arguments?\n"); + + uint8_t * user_hdr = NULL; + size_t user_hdr_len = 0; + + if (read_header) + { + if (create_flag != 1) + usage(stderr, "--read-header is not useful unless creating\n"); + user_hdr = alloca(65536); + user_hdr_len = read_all(STDIN_FILENO, user_hdr, 65536); + if (user_hdr_len == (size_t) -1) + TSABORTX("Error reading user header"); + } + + if (copyin_flag) + return iqueue_copyin_from_stdin(iqueue_file, follow); + + if (create_flag) + { + const uint64_t creation = tsclock_getnanos(0); + iqueue_t * const iq = iqueue_create( + iqueue_file, + creation, + user_hdr, + user_hdr_len + ); + if (!iq) + TSABORTX("%s: Unable to create", iqueue_file); + if (iqueue_creation(iq) != creation) + TSLOGX(TSINFO, "%s: iqueue already existed", iqueue_file); + return EXIT_SUCCESS; + } + + iqueue_t * const iq = iqueue_open(iqueue_file, writable); + if (!iq) + TSABORTX("Failed to %s %s", + create_flag == 1 ? 
"create" : "open", + iqueue_file + ); + + if (writer_flag) + return iqueue_update_writer(iq, writer_flag); + + if (do_stats) + return iqueue_stats_output(iq); + if (do_watch) + return iqueue_watch_output(iq, 1e6); + if (copyout_flag) + return iqueue_copyout(iq, follow, begin_entry, end_entry); + if (append) + return iqueue_append_one(iq, STDIN_FILENO, zero_out); + if (append_line) + return iqueue_append_line(iq, STDIN_FILENO, zero_out, follow); + + if (do_seal) + return iqueue_seal_and_archive(iq, do_archive); + else + if (do_archive) + return iqueue_archive(iq, IQUEUE_MSG_BAD_ID); + + if (debug_entry != (uint64_t) -2) + { + iqueue_debug(iq, debug_entry); + return EXIT_SUCCESS; + } + + // Print the user header on the file, if there is one + if (header) + { + size_t hdr_len; + const uint8_t * const hdr_buf = iqueue_header(iq, &hdr_len); + size_t offset = 0; + while (offset < hdr_len) + { + ssize_t wlen = write( + STDOUT_FILENO, + hdr_buf + offset, + hdr_len - offset + ); + if (wlen <= 0) + TSABORT("header write failed"); + offset += wlen; + } + } + + uint64_t id = begin_entry; + if (print_entry != (uint64_t) -1) + id = print_entry; + + while (1) + { + if (end_entry != (uint64_t) -1 && id > end_entry) + break; + size_t len; + const uint8_t * data = iqueue_data(iq, id, &len); + if (!data) + { + if (!follow) + break; + if (iqueue_is_sealed(iq)) + break; + usleep(100); + continue; + } + + id++; + + if (binary_out) + { + ssize_t wlen = tsio_write_all(STDOUT_FILENO, data, len); + if ((size_t) wlen != len) + TSABORT("write failed"); + } else { + // ASCII output + for (uint64_t i = 0 ; i < len ; i++) + { + uint8_t c = data[i]; + if (c == '\\') + printf("\\\\"); + else + if (isprint(c)) // || isspace(c)) + printf("%c", c); + else + printf("\\%03o", c); + } + + printf("%c", zero_out ? 
'\0' : '\n'); + } + + // If they have called us with print entry, we are done + if (print_entry != (uint64_t) -1) + break; + } + + iqueue_close(iq); + return 0; +} diff --git a/src/iqueue.c b/src/iqueue.c new file mode 100644 index 0000000..a34ef0a --- /dev/null +++ b/src/iqueue.c @@ -0,0 +1,2041 @@ +/* $TwoSigma: iqueue.c,v 1.30 2012/01/05 21:45:21 thudson Exp $ */ + +/* + * Copyright (c) 2010 Two Sigma Investments, LLC + * All Rights Reserved + * + * THIS IS UNPUBLISHED PROPRIETARY SOURCE CODE OF + * Two Sigma Investments, LLC. + * + * The copyright notice above does not evidence any + * actual or intended publication of such source code. + */ + +#include "twosigma.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "tsutil.h" +#include "tsclock.h" +#include "iqueue.h" +#include "iqsync.h" +#include "atomic.h" +#include "tslock.h" +#include "shash.h" + +__RCSID("$TwoSigma: iqueue.c,v 1.30 2012/01/05 21:45:21 thudson Exp $"); + + +#define IQUEUE_INDEX_MAGIC ((uint32_t) 0xDADA1234) +#define IQUEUE_VERSION ((uint32_t) 0x00003000) +#define IQUEUE_BLOCK_MAGIC ((uint64_t) 0x6971626c6f636b00) +#define IQUEUE_MAX_HDR_LEN ((size_t) 4096) + +#define IQUEUE_TABLE_SHIFT 20 +#define IQUEUE_TABLE_MASK ((1 << IQUEUE_TABLE_SHIFT) - 1) +#define IQUEUE_TABLE_SIZE (1 << 20) + +#define IQUEUE_WRITER_TABLES 4 + +#define IQUEUE_BLOCK_COUNT (1024) + + +typedef struct +{ + const uint32_t magic; + const uint32_t version; + const uint64_t creation_time; + volatile uint64_t flock_time; // for growing the file + + // pointers to/size of the writer tables. Must be at least + // 8-byte aligned to ensure that it does not cross a cache line. 
+ const iqueue_msg_t writer_tables[IQUEUE_WRITER_TABLES] + __attribute__((aligned(8))); + + volatile iqueue_id_t index_tail __attribute__((aligned(64))); + volatile uint64_t data_tail __attribute__((aligned(64))); + + const uint64_t hdr_len; + uint8_t hdr[IQUEUE_MAX_HDR_LEN]; + + volatile uint64_t tables[] __attribute__((aligned(4096))); +} __attribute__((packed)) +iqueue_index_t; + +// Ensure that the writer table has not pushed the index to the wrong +// location. +static_assert(offsetof(iqueue_index_t, index_tail) == 64, "offset error"); + + +typedef struct +{ + const uint64_t magic; + const uint64_t offset; + const uint64_t creation_time; + uint64_t reserved; +} __attribute__((packed)) +iqueue_block_t; + + +struct _iqueue +{ + iqueue_index_t * idx; // Typically data[0] + int fd; + const char * const name; + + // Cache the last index used so that the tables do not need to + // be consulted each time. + // Value is 20 bits of table ID and 44 bits of offset + volatile uint64_t table_cache; + + // Flags used for mapping new blocks + int open_flags; + int open_mode; + int mmap_prot; + int mmap_flags; + int mlock_flag; + + // Last size that we grew the file to + uint64_t last_grow_size; + + // Set once we have warned on an attempt to allocate + int warned_readonly_allocate; + + // iqueue writer tables + shash_t * writer_tables[IQUEUE_WRITER_TABLES]; + + // Prefetch thread + pthread_t prefetch_thread; + pthread_t syncbehind_thread; + sem_t prefetch_sem; + + uint8_t * volatile blocks[IQUEUE_BLOCK_COUNT]; +}; + + + +/** Attempt to lock the iqx file. + * This needs to deal with several problems in the flock(2) interface: + * 1. The locks are recursive + * 2. The locks are per-fd + * 3. The locks are shared across forks + * + * \return -1 on error, 0 on no lock (but progress) and the lock time on a successful lock. 
+ */ +static uint64_t +_iqueue_flock( + iqueue_t * const iq +) +{ + if (iq->last_grow_size == 0) + return 1; + + iqueue_index_t * const idx = iq->idx; + + // flock the file, which might succeed if we share the fd with + // the process that has it locked, in which case we can then check + // the flock time field. + if (flock(iq->fd, LOCK_EX) < 0) + return -1; + + // Check to see the file lock time + uint64_t now = tsclock_getnanos(0); + uint64_t old_flock_time = atomic_cas_64(&idx->flock_time, 0, now); + + // If there was no file lock time set, and we wrote our time to it, + // then we have the locks and are ready to proceed. + if (old_flock_time == 0) + return now; + + TSLOGX(TSINFO, "%s: lock held since %"PRIu64" (now=%"PRIu64")", + iq->name, + old_flock_time, + now + ); + + // Spin for up to 1 msec or until the flock_time changes. + const uint64_t flock_timeout = 1000000; + while (now < old_flock_time + flock_timeout) + { + // If it changes, that means that another thread in our process + // has finished its business and the lock conditions should be + // rechecked to see if it even matters any more. + if (idx->flock_time != old_flock_time) + { + flock(iq->fd, LOCK_UN); + TSLOGX(TSDEBUG, "%s: Lock is available again", iq->name); + return 0; + } + + usleep(10); + now = tsclock_getnanos(0); + } + + // Someone else has held the lock for more than our timeout, + // which means they are likely dead. Steal the lock from + // them if we can. + if (atomic_cas_bool_64(&idx->flock_time, old_flock_time, now)) + { + TSLOGX(TSWARN, "%s: Stole lock after timeout", iq->name); + return now; + } + + flock(iq->fd, LOCK_UN); + TSLOGX(TSDEBUG, "%s: Lock is available again", iq->name); + return 0; +} + + +static void +_iqueue_funlock( + iqueue_t * const iq, + const uint64_t lock_time +) +{ + if (iq->last_grow_size == 0) + return; + + if (!atomic_cas_bool_64(&iq->idx->flock_time, lock_time, 0)) + TSLOGX(TSWARN, "%s: Lock was stolen from us! 
Danger!", iq->name); + + flock(iq->fd, LOCK_UN); +} + + + +/** Grow the data file to be at least as large as the new_size. + * + * We can't just do an ftruncate() to the new size since there may + * be multiple processes writing to the file and that could lead + * to some nasty races. + * + * \return -1 on error, 0 on success. + */ +static int +iqueue_grow_file( + iqueue_t * const iq, + const uint64_t new_size +) +{ + struct stat sb; +retry: + if (fstat(iq->fd, &sb) < 0) + goto fail; + + // Once we (or someone) have been successful in growing the file, + // we're done and can return. + if (sb.st_size >= (off_t) new_size) + { + TSLOGX(TSDEBUG, "%s: File is already %"PRIu64" bytes >= %"PRIu64, + iq->name, + (uint64_t) sb.st_size, + new_size + ); + return 0; + } + + // For the first block, we can't lock since the file doesn't exist. + const uint64_t flock_time = _iqueue_flock(iq); + if (flock_time == (uint64_t) -1) + goto fail; + if (flock_time == 0) + goto retry; + + // Double check the file size, just in case + // in between us checking the size and then getting the lock, + // someone else has grown the file. 
+ if (fstat(iq->fd, &sb) < 0) + { + _iqueue_funlock(iq, flock_time); + goto fail; + } + + if (sb.st_size >= (off_t) new_size) + { + _iqueue_funlock(iq, flock_time); + TSLOGX(TSDEBUG, "%s: Someone else grew the file to %"PRIu64, + iq->name, + new_size + ); + return 0; + } + + + TSLOGX(TSINFO, "%s: Growing from 0x%"PRIx64" to 0x%"PRIx64" bytes", + iq->name, + (uint64_t) sb.st_size, + new_size + ); + + if (ftruncate(iq->fd, new_size) < 0) + { + _iqueue_funlock(iq, flock_time); + goto fail; + } + + _iqueue_funlock(iq, flock_time); + iq->last_grow_size = new_size; // possible race, but doesn't matter + return 0; + +fail: + TSLOG(TSERROR, "%s: Failed to grow to %"PRIu64" bytes", + iq->name, + new_size + ); + + return -1; +} + + +/** Call mlock() on a single block */ +static int +iqueue_mlock_block( + iqueue_t * const iq, + const uint64_t block_id +) +{ + if (block_id >= IQUEUE_BLOCK_COUNT) + { + TSLOGX(TSWARN, "%s: Block %"PRIu64" out of range", iq->name, block_id); + return -1; + } + + void * const block = iq->blocks[block_id]; + if (!block) + return 0; + + if (mlock(block, IQUEUE_BLOCK_SIZE) == 0) + return 0; + + TSLOG(TSWARN, "%s: Unable to mlock(block[%"PRIu64"]=%p,0x%"PRIx64")", + iq->name, + block_id, + block, + IQUEUE_BLOCK_SIZE + ); + + iq->mlock_flag = 0; + + return -1; +} + + + +/** Sync a single block */ +static int +iqueue_fsync_block( + iqueue_t * const iq, + const uint64_t block_id +) +{ + if (block_id >= IQUEUE_BLOCK_COUNT) + { + TSLOGX(TSWARN, "%s: Block %"PRIu64" out of range", iq->name, block_id); + return -1; + } + + void * const block = iq->blocks[block_id]; + if (!block) + return -1; + + const uint64_t block_offset = block_id << IQUEUE_BLOCK_SHIFT; + + // First sync contents of block to ensure any dirty pages in our mapping + // are saved back to the file + if (sync_file_range( + iq->fd, + block_offset, + IQUEUE_BLOCK_SIZE, + SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE) != 0) { + TSLOG(TSWARN, "%s: Unable to 
fsync(block[%"PRIu64"]=%p,0x%"PRIx64")", + iq->name, + block_id, + block, + IQUEUE_BLOCK_SIZE + ); + return -1; + } + + // Now free the VMAs so Linux will reclaim teh pages (unless we access + // it again) + if (madvise(block, IQUEUE_BLOCK_SIZE, MADV_DONTNEED) != 0) { + TSLOG(TSWARN, "%s: Unable to madvise(block[%"PRIu64"]=%p,0x%"PRIx64 + ", MADV_DONTNEED)", + iq->name, + block_id, + block, + IQUEUE_BLOCK_SIZE + ); + return -1; + } + + return 0; +} + + + + +/** Map a block into memory if it is not already mapped. + * + * \note Will grow the file if necessary. + * \return NULL if there was a failure to grow the file or map the block. + */ +static void * +_iqueue_map_block( + iqueue_t * const iq, + const uint64_t block_id +) +{ + // If we do not have a file, we can not do any mappings + if (iq->fd < 0) + return NULL; + + // Make sure the file is at least as large as this block size + const uint64_t min_size = (block_id + 2) << IQUEUE_BLOCK_SHIFT; + if (iqueue_grow_file(iq, min_size) < 0) + return NULL; + + // Quick check to see if any one else has already done so + uint8_t * block = iq->blocks[block_id]; + if (block) + { + TSLOGX(TSDEBUG, "%s: block %"PRIu64" already mapped to %p", iq->name, block_id, block); + return block; + } + + // Attempt to map it + const uint64_t block_offset = block_id << IQUEUE_BLOCK_SHIFT; + uint64_t map_time = -tsclock_getnanos(0); + + block = mmap( + NULL, + IQUEUE_BLOCK_SIZE, + iq->mmap_prot, + iq->mmap_flags, + iq->fd, + block_offset + ); + if (block == MAP_FAILED) + { + TSLOGX(TSERROR, "%s: Failed to map offset %"PRIu64, iq->name, block_offset); + return NULL; + } + + map_time += tsclock_getnanos(0); + + // Attempt to write the mapping into the local block table + if (!atomic_cas_bool_ptr((volatile void*) &iq->blocks[block_id], NULL, block)) + { + // We lost! Someone else beat us to it. Deallocate our block + // and use theirs instead. Sucks to be us. + TSLOGX(TSDEBUG, + "%s: Lost race. 
Unmapping %"PRIx64" from %p", + iq->name, + block_offset, + block + ); + + munmap(block, IQUEUE_BLOCK_SIZE); + return iq->blocks[block_id]; + } + + TSLOGX(TSINFO, "%s: Mapped 0x%"PRIx64" to %p in %"PRIu64" ns", + iq->name, + block_offset, + block, + map_time + ); + + if (iq->mlock_flag) + iqueue_mlock_block(iq, block_id); + + posix_madvise(block, IQUEUE_BLOCK_SIZE, POSIX_MADV_SEQUENTIAL); + + return block; +} + + +/** Unmap any mapped blocks, close the file descriptors and generally + * shutdown the whole kit-n-kaboodle. + * + * This does not free the iqueue structure since it might be reused + * by iqueue_reopen(). + * + * Any prefetch thread will be shutdown; should it be paused and + * restarted instead? + */ +static void +_iqueue_unmap( + iqueue_t * const iq +) +{ + if (iq->prefetch_thread) + { + TSLOGX(TSINFO, "%s: Shutdown prefetch thread", iq->name); + pthread_cancel(iq->prefetch_thread); + pthread_join(iq->prefetch_thread, NULL); + iq->prefetch_thread = 0; + } + + if (iq->syncbehind_thread) + { + TSLOGX(TSINFO, "%s: Shutdown syncbehind thread", iq->name); + pthread_cancel(iq->syncbehind_thread); + pthread_join(iq->syncbehind_thread, NULL); + iq->syncbehind_thread = 0; + } + + if (iq->fd >= 0) + { + close(iq->fd); + iq->fd = -1; + } + + // Don't unmap idx if it was shared with iq->blocks[0] + if ((void*) iq->idx != iq->blocks[0] && iq->idx) + { + TSLOGX(TSDEBUG, "%s: Unmapping idx %p", iq->name, iq->idx); + munmap(iq->idx, IQUEUE_BLOCK_SIZE); + } + + iq->idx = NULL; + + for (unsigned i = 0 ; i < IQUEUE_BLOCK_COUNT ; i++) + { + if (!iq->blocks[i]) + continue; + + TSLOGX(TSDEBUG, "%s: Unmapping block %u: %p", iq->name, i, iq->blocks[i]); + munmap(iq->blocks[i], IQUEUE_BLOCK_SIZE); + iq->blocks[i] = NULL; + } + + // Trash the cached table lookup so that we won't use it + iq->table_cache = -1; +} + + + +/** Attempt to re-open an iqueue after the on-disk file has changed. 
+ */ +static int +_iqueue_reopen( + iqueue_t * const iq, + const int create_flag +) +{ + int serrno; + + if (iq->fd >= 0) + _iqueue_unmap(iq); + + const int open_flags = iq->open_flags | (create_flag ? O_CREAT : 0); + const char * const open_str = + open_flags == O_RDONLY ? "readonly" : + open_flags == O_RDWR ? "readwrite" : + "create"; + + iq->fd = open( + iq->name, + open_flags, + iq->open_mode + ); + if (iq->fd < 0) + { + if (errno == ENOENT) + { + TSLOGX(TSWARN, "%s: No such file or directory", iq->name); + sleep(1); // force a short wait + errno = ENOENT; + return -1; + } + + TSLOGX(TSERROR, "%s: Unable to open %s", iq->name, open_str); + goto fail; + } + + // If we are creating the iqueue for the first time we are allowed + // to map using _iqueue_map_block() since will be overwriting our + // temporary file. + if (create_flag) + { + iq->idx = _iqueue_map_block(iq, 0); + if (iq->idx == NULL) + goto fail; + return 0; + } + + // We can't use _iqueue_map_block() since that might modify an existing + // file. Instead we have to just map a minimal segment at first + void * block = mmap( + NULL, + IQUEUE_BLOCK_SIZE, + iq->mmap_prot, + iq->mmap_flags, + iq->fd, + 0 + ); + if (block == MAP_FAILED) + { + TSLOG(TSERROR, "%s: Failed to map index header", iq->name); + goto fail; + } + + iq->idx = block; + + // Make sure that the file is at least large enough for our header + struct stat sb; + if (fstat(iq->fd, &sb) < 0) + { + TSLOG(TSERROR, "%s: Failed to stat", iq->name); + goto fail; + } + + const uint64_t file_size = sb.st_size; + + if ((size_t) file_size < sizeof(*iq->idx)) + { + TSLOGX(TSWARN, "%s: File is much too small. 
Not an iqx?", iq->name); + goto fail; + } + + + // Verify the version and magic of the existing iqueue file + if (iq->idx->magic != IQUEUE_INDEX_MAGIC + || iq->idx->version != IQUEUE_VERSION) + { + TSLOGX(TSERROR, + "%s: Magic %"PRIx32".%"PRIx32" != expected %"PRIx32".%"PRIx32, + iq->name, + iq->idx->magic, + iq->idx->version, + IQUEUE_INDEX_MAGIC, + IQUEUE_VERSION + ); + goto fail; + } + + // Everything looks ok so far. + TSLOGX(TSDEBUG, + "%s: %s: creation %"PRIu64", entries %"PRIu64", data %"PRIu64", size %"PRIu64" %s", + iq->name, + open_str, + iq->idx->creation_time, + iq->idx->index_tail, + iq->idx->data_tail, + file_size, + iqueue_is_sealed(iq) ? ", sealed" : "" + ); + + iq->last_grow_size = file_size; + + return 0; + +fail: + /* Save the errno so that the caller knows why we failed */ + serrno = errno; + _iqueue_unmap(iq); + errno = serrno; + return -1; +} + + + + +/** Build the minimal memory representation of an iqueue and map + * the first block. + * + * \note index_filename will be cached in the object and freed + * when iqueue_close() is called. 
+ */ +static iqueue_t * +_iqueue_init( + const char * const filename, + int create_flag // -1 == read-only, 0 == read-write, 1 == create +) +{ + iqueue_t * const iq = calloc(1, sizeof(*iq)); + if (!iq) + goto fail_iqueue_malloc; + + int open_flags = O_RDONLY; + int open_mode = 0444; + int mmap_prot = PROT_READ; + int mmap_flags = MAP_SHARED; + + if (create_flag >= 0) + { + open_flags = O_RDWR; + open_mode |= 0222; + mmap_prot |= PROT_WRITE; + } + + memcpy(iq, &(iqueue_t) { + .fd = -1, + .table_cache = -1, + .name = filename, + .open_flags = open_flags, + .open_mode = open_mode, + .mmap_prot = mmap_prot, + .mmap_flags = mmap_flags, + }, sizeof(*iq)); + + sem_init(&iq->prefetch_sem, 0, 0); + + if (_iqueue_reopen(iq, create_flag == 1) < 0) + goto fail_reopen; + + return iq; + +fail_reopen: + free(iq); +fail_iqueue_malloc: + return NULL; +} + + + +iqueue_t * +iqueue_open( + const char * index_filename, + bool writeable +) +{ + char * const filename = strdup(index_filename); + if (!filename) + return NULL; + + iqueue_t * const iq = _iqueue_init(filename, writeable ? 
0 : -1); + if (!iq) + return NULL; + + if (writeable) + iqueue_prefetch(iq, 0, 16 << 20); + + return iq; +} + + +iqueue_t * +iqueue_create( + const char * index_filename, + uint64_t creation, + const void * const hdr, + size_t hdr_len +) +{ + if (creation == 0) + creation = tsclock_getnanos(0); + + if (hdr_len > IQUEUE_MAX_HDR_LEN) + { + TSLOGX(TSERROR, "%s: Header len %zu > max %zu", + index_filename, + hdr_len, + IQUEUE_MAX_HDR_LEN + ); + + return NULL; + } + + const int namelen = strlen(index_filename); + char * filename = calloc(1, namelen + 32); + if (!filename) + goto fail_filename_alloc; + snprintf(filename, namelen+32, "%s.%"PRIx64, index_filename, creation); + + iqueue_t * const iq = _iqueue_init(filename, 1); + if (!iq) + goto fail_iq_alloc; + + // Fill in the required fields and user header + memcpy(iq->idx, &(iqueue_index_t) { + .magic = IQUEUE_INDEX_MAGIC, + .version = IQUEUE_VERSION, + .creation_time = creation, + .hdr_len = hdr_len, + .index_tail = 0, + .data_tail = sizeof(*iq->idx) + + IQUEUE_TABLE_SIZE * sizeof(*iq->idx->tables), + }, sizeof(*iq->idx)); + + memcpy(iq->idx->hdr, hdr, hdr_len); + + // The file is fully built on disk. Attempt to atomically swap it for + // the real one. + if (link(filename, index_filename) == -1) + { + if (errno != EEXIST) + { + TSLOG(TSERROR, "%s: Unable to link from %s", index_filename, filename); + goto fail_link; + } + + // Remove our temp file, and trailing creation time + unlink(filename); + filename[namelen] = '\0'; + + // Clean up and try to open it as a normal + // iqueue. The caller will know that they lost the race since + // the creation time will not be the same as the one they specified + // \note: Do not goto the failure path since we do not want to unlink + // the actual iqueue file. + TSLOGX(TSINFO, "%s: Lost creation race. Retrying", index_filename); + if (_iqueue_reopen(iq, 0) < 0) + { + iqueue_close(iq); + return NULL; + } + + return iq; + } + + // We won the race. 
Unlink our temp file, update our name and keep going + unlink(filename); + filename[namelen] = '\0'; + + // We know that we will be writing into it, so prefetch the index block and + // some of the first data. + iqueue_prefetch(iq, 0, 16 << 20); + + return iq; + +fail_link: + unlink(filename); + _iqueue_unmap(iq); + free(iq); +fail_iq_alloc: + free(filename); +fail_filename_alloc: + return NULL; +} + + + +int +iqueue_reopen( + iqueue_t * const iq +) +{ + return _iqueue_reopen(iq, false); +} + + +void +iqueue_close( + iqueue_t * const iq +) +{ + _iqueue_unmap(iq); + free((void*)(uintptr_t) iq->name); + free(iq); +} + + +int +iqueue_archive( + iqueue_t * const iq, + iqueue_id_t seal_id +) +{ + + // Attempt to seal the iqueue at this id if one is provided + if (seal_id != IQUEUE_MSG_BAD_ID) + { + int rc = iqueue_try_seal(iq, seal_id); + if (rc != 0) + { + TSLOGX(TSDEBUG, "%s: Failed to seal at id %"PRIu64": rc=%d", + iq->name, + seal_id, + rc + ); + return rc; + } + } + + // We have successfully sealed the iqueue (or are not doing so) + const char * const old_name = iqueue_name(iq); + const size_t namelen = strlen(old_name) + 32; + char * new_name = calloc(1, namelen); + if (!new_name) + return -1; + + snprintf(new_name, namelen, + "%s.%"PRIu64, + old_name, + iqueue_creation(iq) + ); + + if (!iqueue_is_sealed(iq)) + TSLOGX(TSWARN, "%s: Archiving an unsealed iqueue", old_name); + + if (link(old_name, new_name) == -1) + { + TSLOG(TSERROR, "%s: Unable to create link to archive %s", + old_name, + new_name + ); + return -1; + } + + if (unlink(old_name) == -1) + { + TSLOG(TSERROR, "%s: Unable to unlink", old_name); + unlink(new_name); + return -1; + } + + TSLOGX(TSDEBUG, "%s: Archived to %s", iq->name, new_name); + + return 0; +} + + + +uint64_t +iqueue_entries( + const iqueue_t * const iq +) +{ + uint64_t tail = iq->idx->index_tail; + + while (1) + { + switch (iqueue_status(iq, tail)) + { + case IQUEUE_STATUS_HAS_DATA: + // There are still entries out there. 
Try the next one + tail++; + continue; + + case IQUEUE_STATUS_NO_DATA: + // We have found the end. + return tail; + + case IQUEUE_STATUS_SEALED: + // We are one past the actual end of data + return tail - 1; + + default: + // Something is very wrong + return (uint64_t) -1; + } + } +} + + +uint64_t +iqueue_data_len( + const iqueue_t * const iq +) +{ + return iq->idx->data_tail; +} + + +uint64_t +iqueue_creation( + const iqueue_t * const iq +) +{ + return iq->idx->creation_time; +} + + +const char * +iqueue_name( + const iqueue_t * const iq +) +{ + return iq->name; +} + + +void * +iqueue_header( + iqueue_t * const iq, + size_t * const len_out +) +{ + *len_out = iq->idx->hdr_len; + return iq->idx->hdr; +} + + + +int +iqueue_mlock( + iqueue_t * const iq +) +{ + iq->mlock_flag = 1; + + for (unsigned block_id = 0 ; block_id < IQUEUE_BLOCK_COUNT ; block_id++) + { + if (iqueue_mlock_block(iq, block_id) == -1) + return -1; + } + + return 0; +} + + + + +/** Retrieve the host memory pointer to an offset. + * Optionally do the mmap() if the block is not already mapped. + */ +const void * +iqueue_get_data( + iqueue_t * const iq, + uint64_t offset, + const int do_map +) +{ + const uint64_t block_id = offset >> IQUEUE_BLOCK_SHIFT; + offset &= IQUEUE_BLOCK_MASK; + + if (unlikely(block_id >= IQUEUE_BLOCK_COUNT)) + return NULL; + + uint8_t * block = iq->blocks[block_id]; + if (unlikely(block == NULL)) + { + if (!do_map) + return NULL; + + // They want it mapped. 
+ block = _iqueue_map_block(iq, block_id); + if (!block) + return NULL; + } + + return block + offset; +} + + +void * +iqueue_allocate_raw( + iqueue_t * const iq, + size_t len, + iqueue_msg_t * const offset_out +) +{ + if (unlikely(!offset_out || len >= IQUEUE_BLOCK_SIZE)) + return NULL; + if (unlikely((iq->mmap_prot & PROT_WRITE) == 0)) + { + if (!iq->warned_readonly_allocate) + TSLOGX(TSWARN, "%s: Attempt to allocate from read-only iqueue", iq->name); + iq->warned_readonly_allocate = 1; + return NULL; + } + + iqueue_index_t * const idx = iq->idx; + uint64_t offset; + + while (1) + { + uint64_t tail = offset = idx->data_tail; + uint64_t new_tail = tail + len; + + // Check to see if this would cross a 1 GB boundary and adjust + // the allocation upwards to avoid the boundary. We also + // must avoid the first 64 bytes of the new block to avoid + // the markers. + if ((tail >> IQUEUE_BLOCK_SHIFT) != (new_tail >> IQUEUE_BLOCK_SHIFT)) + { + offset = ((tail >> IQUEUE_BLOCK_SHIFT) + 1) << IQUEUE_BLOCK_SHIFT; + offset += sizeof(iqueue_block_t); + new_tail = offset + len; + } + + if (atomic_cas_bool_64(&idx->data_tail, tail, new_tail)) + break; + } + + // We have updated the idx->data_tail and can start to fill + // in the data into the buffer. It will not be valid data + // until iqueue_update() is called on the data pointer. + void * const data = (void*)(uintptr_t) iqueue_get_data(iq, offset, 1); + if (!data) + return NULL; + + *offset_out = iqueue_msg(offset, len); + return data; +} + + +/** Perform the atomic update of the index slot to point to the + * new message. + * + * If the write is successful, the tail pointer will also be updated + * if the value being written is not a sealing tag. + * + * \return 0 if unsuccessful, 1 if succesfully written. 
+ */ +static inline int +iqueue_cas( + iqueue_index_t * const idx, + iqueue_msg_t * const slot, + const iqueue_id_t id, + const iqueue_msg_t new_msg +) +{ + // If the value is still 0, write our msg offset into it + // If we fail, return immediately. + if (unlikely(!atomic_cas_bool_64( + (uint64_t*)(uintptr_t)&slot->v, + 0, + new_msg.v + ))) + return 0; + + // Do not advance the tail for seal messages, so reader can see them + if (unlikely(new_msg.v == IQUEUE_MSG_SEALED)) + return 1; + + // We have written our offset into slot id, try to advance + // the tail to one past where we are, but do not care if + // we fail. That means that someone else has already advanced + // the idx for us. + const uint64_t current_tail = idx->index_tail; + if (current_tail >= id + 1) + return 1; + + atomic_cas_bool_64(&idx->index_tail, current_tail, id + 1); + + return 1; +} + + + +/** Create a new table and install it in the desired location */ +static uint64_t +iqueue_new_table( + iqueue_t * const iq, + uint64_t table_num +) +{ + iqueue_index_t * const idx = iq->idx; + iqueue_msg_t msg; + const void * const table_buf = iqueue_allocate_raw( + iq, + IQUEUE_TABLE_SIZE * sizeof(*idx->tables) + 32, + &msg + ); + if (!table_buf) + return 0; + + uint64_t offset = iqueue_msg_offset(msg); + + // force alignment up to a cache line + offset = (offset + 31) & ~31; + + // Attempt to store the new table into the correct slot + if (atomic_cas_bool_64(&idx->tables[table_num], 0, offset)) + { + TSLOGX(TSDIAG, "%s: New tables[%"PRIu64"] = %"PRIx64, + iq->name, + table_num, + offset + ); + return offset; + } + + // We lost the race, but now have a huge allocation. Try + // to make things better by storing it in the next slot + const uint64_t correct_offset = idx->tables[table_num]; + + while (++table_num < IQUEUE_TABLE_SIZE) + { + if (atomic_cas_bool_64(&idx->tables[table_num], 0, offset)) + return correct_offset; + } + + // We've fully populated the tables? 
This really shouldn't happen, + // but we'll just leak the allocation that we did. + return correct_offset; +} + + + +/** Given an index, lookup the pointer to the slot in the correct table. + * Optionally create the intermediate table if necessary. + */ +static iqueue_msg_t * +iqueue_get_slot( + iqueue_t * const iq, + const uint64_t id, + const int create +) +{ + const uint64_t last_table = iq->table_cache; + const uint64_t table_num = id >> IQUEUE_TABLE_SHIFT; + const uint64_t offset = id & IQUEUE_TABLE_MASK; + uint64_t table_offset; + + if (likely((last_table & IQUEUE_TABLE_MASK) == table_num)) + { + // Hurrah! We hit our cached value + table_offset = last_table >> IQUEUE_TABLE_SHIFT; + } else { + // Not the cached value; do a full lookup + if (unlikely(table_num > IQUEUE_TABLE_SIZE)) + return NULL; + + table_offset = iq->idx->tables[table_num]; + if (unlikely(!table_offset)) + { + // There is no table for this id yet, create it if requested + if (!create) + return NULL; + + table_offset = iqueue_new_table(iq, table_num); + if (!table_offset) + return NULL; + } + + // We have a pointer to the table; cache the value and return the table + iq->table_cache = (table_offset << IQUEUE_TABLE_SHIFT) | table_num; + } + + // We have a table offset now; find the actual block that goes with it + iqueue_msg_t * const table = (void*)(uintptr_t) iqueue_get_data(iq, table_offset, 1); + if (!table) + return NULL; + return &table[offset]; +} + + + +static inline int +iqueue_try_update_internal( + iqueue_t * iq, + iqueue_id_t id, + iqueue_msg_t new_msg +) +{ + iqueue_index_t * const idx = iq->idx; + const iqueue_id_t tail = iq->idx->index_tail; + + if (unlikely(tail != id)) + { + // If the id they are trying to write to is less than the + // current tail, it is guaranteed to fail since there is already + // something written there. 
+ if (id < tail) + return IQUEUE_STATUS_HAS_DATA; + + // If the id they are trying to write to is not at the tail + // position, then it would leave a hole in the index. This is + // not allowed, so the index might be invalid. To confirm, + // check to see if the actual entries value is wrong. + if (id > iqueue_entries(iq)) + return IQUEUE_STATUS_INDEX_INVALID; + + // It was a spurious case of the tail being wrong; allow the + // iqueue_try_update() to proceed. + } + + iqueue_msg_t * const slot = iqueue_get_slot(iq, id, 1); + if (unlikely(!slot)) + return IQUEUE_STATUS_INDEX_INVALID; + + if (likely(iqueue_cas(idx, slot, id, new_msg))) + return 0; + + // We lost. Check for the possibility that the iqueue has been sealed + if (unlikely(slot->v == IQUEUE_MSG_SEALED)) + return IQUEUE_STATUS_SEALED; + + return IQUEUE_STATUS_HAS_DATA; +} + + +static inline int +iqueue_update_internal( + iqueue_t * iq, + iqueue_msg_t new_msg, + iqueue_id_t * id_out, + int is_id_be +) +{ + iqueue_index_t * const idx = iq->idx; + + // Find the next available slot + while (1) + { + const iqueue_id_t id = idx->index_tail; + iqueue_msg_t * const slot = iqueue_get_slot(iq, id, 1); + if (unlikely(!slot)) + return IQUEUE_STATUS_INDEX_INVALID; + + if (unlikely(slot->v == IQUEUE_MSG_SEALED)) + return IQUEUE_STATUS_SEALED; + + if (unlikely(slot->v)) + { + // The list is in an inconsistent state; try to advance + // the tail pointer. + atomic_cas_bool_64(&idx->index_tail, id, id+1); + continue; + } + + // Write to user slot before attempting the CAS to preserve + // all lockless guarantees. + if (id_out != NULL) + *id_out = (is_id_be ? 
htobe64(id) : id); + + if (likely(iqueue_cas(idx, slot, id, new_msg))) + return 0; + } +} + + +int +iqueue_update( + iqueue_t * const iq, + iqueue_msg_t new_msg, + iqueue_id_t * const id_out +) +{ + return iqueue_update_internal(iq, new_msg, id_out, 0); +} + + +int +iqueue_update_be( + iqueue_t * const iq, + iqueue_msg_t new_msg, + iqueue_id_t * const id_be_out +) +{ + return iqueue_update_internal(iq, new_msg, id_be_out, 1); +} + + +/** Attempt to store a message in the log at the desired slot. + * entry should point to the buffer returned from iqueue_allocate() + * once all of the log data has been copied into it. + * + * \return same as iqueue_update with additional error: + * EAGAIN: Specified slot has been filled + */ +int +iqueue_try_update( + iqueue_t * const iq, + iqueue_id_t id, + iqueue_msg_t new_msg +) +{ + return iqueue_try_update_internal(iq, id, new_msg); +} + + +/** Seals the iqueue, blocking any further write attemts + * + * \return same as iqueue_update + * */ +int +iqueue_seal( + iqueue_t * iq +) +{ + const iqueue_msg_t new_msg = { + .v = IQUEUE_MSG_SEALED + }; + + return iqueue_update_internal(iq, new_msg, NULL, 0); +} + + +int +iqueue_try_seal( + iqueue_t * iq, + iqueue_id_t id +) +{ + const iqueue_msg_t new_msg = { + .v = IQUEUE_MSG_SEALED + }; + + return iqueue_try_update_internal(iq, id, new_msg); +} + + +iqueue_id_t +iqueue_begin( + const iqueue_t * const iq +) +{ + __USE(iq); + return 0; +} + + +iqueue_id_t +iqueue_end( + const iqueue_t * iq +) +{ + return iq->idx->index_tail; +} + + +int +iqueue_status( + const iqueue_t * const iq, + iqueue_id_t id +) +{ + iqueue_msg_t * const slot + = iqueue_get_slot(__UNCONST_T(iqueue_t*, iq), id, 0); + + if (!slot || !slot->v) + return IQUEUE_STATUS_NO_DATA; + + if (slot->v == IQUEUE_MSG_SEALED) + return IQUEUE_STATUS_SEALED; + + return IQUEUE_STATUS_HAS_DATA; +} + + +int +iqueue_status_wait( + iqueue_t * iq, + iqueue_id_t id, + int64_t timeout_ns +) +{ + // Retrieve the map-space pointer + volatile 
iqueue_msg_t * msg = NULL; + tsclock_nanos_t start_time = 0; + + while (1) + { + if (!msg) + { + // Try to get the slot, but do not modify the tables. + // If we are blocking forever, keep trying + msg = iqueue_get_slot(iq, id, 0); + if (!msg && timeout_ns >= 0 && timeout_ns < 10) + return IQUEUE_STATUS_NO_DATA; + } + + if (msg) + { + // We have the slot; try + // Try a few times before checking the clock + for (int i = 0 ; i < 1000 ; i++) + { + if (unlikely(!msg->v)) { + if (timeout_ns == 0) + return IQUEUE_STATUS_NO_DATA; + + _mm_pause(); + continue; + } + + // Check if the iqueue is sealed at this index + if (unlikely(msg->v == IQUEUE_MSG_SEALED)) + return IQUEUE_STATUS_SEALED; + + // Build a user-space pointer from the pointer + return IQUEUE_STATUS_HAS_DATA; + } + } + + // timeout == -1 means loop forever + if (timeout_ns == -1) + continue; + + // timeout < 10 means just check the queue a few times + if (timeout_ns < 10) + break; + + if (!start_time) + start_time = tsclock_getnanos(0); + + if (start_time + timeout_ns < tsclock_getnanos(0)) + break; + } + + return IQUEUE_STATUS_NO_DATA; +} + + + +uint64_t +iqueue_offset( + iqueue_t * const iq, + iqueue_id_t id, + size_t * const size_out +) +{ + // Retrieve the map-space pointer + iqueue_msg_t * const msg_ptr = iqueue_get_slot(iq, id, 0); + if (unlikely(!msg_ptr)) + return -1; + + iqueue_msg_t msg = *msg_ptr; + if (unlikely(!msg.v)) + return -1; + + // Check if the iqueue is sealed at this index + if (unlikely(msg.v == IQUEUE_MSG_SEALED)) + return -1; + + if (likely(size_out)) + *size_out = iqueue_msg_len(msg); + + TSLOGX(TSDIAG, "%s: %"PRIx64": %p = %"PRIx64, iq->name, id, msg_ptr, msg.v); + return iqueue_msg_offset(msg); +} + + +int +iqueue_is_sealed( + iqueue_t * const iqueue +) +{ + iqueue_id_t id = iqueue_end(iqueue); + + while (1) + { + int status = iqueue_status(iqueue, id++); + if (status == IQUEUE_STATUS_HAS_DATA) + continue; + + return status == IQUEUE_STATUS_SEALED; + } +} + + +int 
+iqueue_allocator_init( + iqueue_t * const iq, + iqueue_allocator_t * allocator, + const size_t bulk_len, + const int auto_refill +) +{ + memcpy(allocator, &(iqueue_allocator_t) { + .iq = iq, + .bulk_len = bulk_len, + .auto_refill = auto_refill, + }, sizeof(*allocator)); + + if (iqueue_allocator_refill(allocator) < 0) + return -1; + + return 0; +} + + + +int +iqueue_allocator_refill( + iqueue_allocator_t * const allocator +) +{ + iqueue_msg_t msg; + allocator->base = iqueue_allocate_raw( + allocator->iq, + allocator->bulk_len, + &msg + ); + if (!allocator->base) + return -1; + + allocator->base_offset = iqueue_msg_offset(msg); + allocator->offset = 0; + + TSLOGX(TSDEBUG, "%s: Refill base=%p offset=%"PRIx64" len=%"PRIx64, + allocator->iq->name, + allocator->base, + allocator->base_offset, + allocator->bulk_len + ); + + return 0; +} + + + +int +iqueue_realloc( + iqueue_allocator_t * const allocator, + iqueue_msg_t * const msg, + const size_t new_len +) +{ + const uint64_t msg_len = iqueue_msg_len(*msg); + + return iqueue_realloc_bulk( + allocator, + msg, + msg_len, + new_len + ); +} + + +int +iqueue_realloc_bulk( + iqueue_allocator_t * const allocator, + iqueue_msg_t * const msg, + const size_t msg_len, + const size_t new_len +) +{ + const uint64_t msg_offset = iqueue_msg_offset(*msg); + if (new_len > IQUEUE_MSG_MAX || new_len > msg_len) + return -1; + + // Where was the offset after this message was allocated + const uint64_t cur_offset = msg_offset + msg_len - allocator->base_offset; + + // Where should the new offset be with the new length + const uint64_t new_offset = msg_offset + new_len - allocator->base_offset; + + // If the offset in the allocator is not the same as cur_offset, + // then further allocations have been done and we can't + // resize this one. + // \todo: Can we do this with atomics to save on locking? 
+ //return atomic_cas_bool_64(&allocator->offset, cur_offset, new_offset); + if (allocator->offset != cur_offset) + return 0; + allocator->offset = new_offset; + *msg = iqueue_msg(msg_offset, new_len); + return 1; +} + + + +int +iqueue_prefetch( + iqueue_t * const iq, + const uint64_t base, + const uint64_t extent +) +{ + for (uint64_t offset = 0 ; offset < extent ; offset += 4096) + { + volatile uint64_t * data = (void*)(uintptr_t) iqueue_get_data(iq, base + offset, 1); + if (!data) + { + TSLOGX(TSERROR, "%s: Unable to get data at offset %"PRIx64, iq->name, base + offset); + return -1; + } + + if (iq->mmap_prot & PROT_WRITE) + atomic_cas_bool_64(data, 0, 0); + else + data[0]; + } + + return 0; +} + + + +static void * +prefetch_thread( + void * iq_ptr +) +{ + iqueue_t * const iq = iq_ptr; + + // Check to see where we last prefetched and start to request pages if + // we have fallen behind + const uint64_t prefetch_size = 16 << 20; + const unsigned prefetch_delay = 100; + + uint64_t offset = iq->idx->data_tail; + + while (1) + { + if (prefetch_delay) + usleep(prefetch_delay); + + if (iq->idx->data_tail + prefetch_size / 2 < offset) + continue; + + // They have used up more than half our last block. 
+ // Start prefetching the next block + uint64_t prefetch_time = -tsclock_getnanos(0); + if (iqueue_prefetch(iq, offset, prefetch_size) < 0) + break; + + prefetch_time += tsclock_getnanos(0); + TSLOGX(TSDEBUG, "%s: Prefetched %"PRIx64" to %"PRIx64" in %"PRIu64" ns", + iq->name, + offset, + offset + prefetch_size, + prefetch_time + ); + + offset += prefetch_size; + } + + return NULL; +} + + +int +iqueue_prefetch_thread( + iqueue_t * const iq, + pthread_t * const thread_out +) +{ + if (!iq->prefetch_thread + && pthread_create(&iq->prefetch_thread, NULL, prefetch_thread, iq) < 0) + { + TSLOG(TSERROR, "%s: Unable to create prefetch thread", iq->name); + return -1; + } + + if (thread_out) + *thread_out = iq->prefetch_thread; + return 0; +} + + +static void * +syncbehind_thread( + void * iq_ptr +) +{ + iqueue_t * const iq = iq_ptr; + + const uint64_t active_block_count = 4; + const unsigned syncbehind_delay = 1e6; + + uint64_t synced_to_block_id = 0; + uint64_t mapped_to_block_id = 0; + + while (1) + { + if (syncbehind_delay) + usleep(syncbehind_delay); + + while (mapped_to_block_id < IQUEUE_BLOCK_COUNT && + iq->blocks[mapped_to_block_id] != NULL) + mapped_to_block_id++; + + // They have used up more than half our last block. 
+ // Start prefetching the next block + for (; mapped_to_block_id - synced_to_block_id > active_block_count; + synced_to_block_id++) { + uint64_t syncbehind_time = -tsclock_getnanos(0); + iqueue_fsync_block(iq, synced_to_block_id); + + syncbehind_time += tsclock_getnanos(0); + TSLOGX(TSINFO, "%s: Synced block %"PRIu64" in %"PRIu64" ns", + iq->name, + synced_to_block_id, + syncbehind_time + ); + } + } + + return NULL; +} + + +int +iqueue_syncbehind_thread( + iqueue_t * const iq, + pthread_t * const thread_out +) +{ + if (!iq->syncbehind_thread + && pthread_create(&iq->syncbehind_thread, NULL, syncbehind_thread, iq) < 0) + { + TSLOG(TSERROR, "%s: Unable to create syncbehind thread", iq->name); + return -1; + } + + if (thread_out) + *thread_out = iq->syncbehind_thread; + return 0; +} + + +static void +iqueue_table_debug( + iqueue_t * const iq +) +{ + int skipped = 0; + printf("table,slot,orig,route,offset,len\n"); + + for (uint64_t i = 0 ; i < IQUEUE_TABLE_SIZE ; i++) + { + const uint64_t offset = iq->idx->tables[i]; + if (!offset) + continue; + + TSLOGX(TSINFO, "%s: table[0x%"PRIx64"] offset 0x%"PRIx64"%s", + iq->name, + i, + offset, + (offset & 0x7) ? " UNALIGNED" : "" + ); + + const uint64_t * const table = iqueue_get_data(iq, offset, 1); + if (!table) + TSABORTX("%s: Unable to get table %"PRIu64"?", iq->name, i); + + for (uint64_t j = 0 ; j <= IQUEUE_TABLE_MASK ; j++) + { + iqueue_msg_t msg = { .v = table[j] }; + const uint64_t off = iqueue_msg_offset(msg); + const uint64_t len = iqueue_msg_len(msg); + if (!off && !len) + { + skipped++; + continue; + } + + if (skipped) + TSLOGX(TSERROR, "%s: Missing indices in table 0x%"PRIx64"!", iq->name, i); + + const struct iqsync_data * const iqsync = iqsync_data_msg(iq, off); + + printf("%"PRIx64",%"PRIx64",%"PRIx64":%"PRIu64",%"PRIx64":%"PRIu64",%"PRIx64",%"PRIu64"\n", + i, + j, + iqsync ? be64toh(iqsync->orig_src) : 0, + iqsync ? be64toh(iqsync->orig_index) : 0, + iqsync ? be64toh(iqsync->src) : 0, + iqsync ? 
be64toh(iqsync->iq_index) : 0, + off, + len + ); + } + } +} + + +void +iqueue_debug( + iqueue_t * const iq, + uint64_t id +) +{ + if (id == (uint64_t) -1) + { + iqueue_table_debug(iq); + return; + } + + size_t len; + uint64_t offset = iqueue_offset(iq, id, &len); + if (offset == (uint64_t) -1) + { + TSLOGX(TSINFO, "%s: %"PRIu64": No slot allocated", + iq->name, + id + ); + return; + } + + const volatile iqueue_msg_t * const slot = iqueue_get_slot(iq, id, 0); + TSLOGX(TSINFO, "%s: %"PRIu64": offset=%"PRId64" len=%zu slot=%p%s", + iq->name, + id, + offset, + len, + slot, + ((uintptr_t) slot & 7) ? " UNALIGNED" : "" + ); + + const struct iqsync_data * const msg = iqsync_data_msg(iq, offset); + if (msg) + { + TSLOGX(TSINFO, "%s: %"PRIu64": sending src=%"PRIu64":%"PRIu64" len=%u", + iq->name, + id, + be64toh(msg->src), + be64toh(msg->iq_index), + be32toh(msg->len) + ); + + TSLOGX(TSINFO, "%s: %"PRIu64": orig src=%"PRIu64":%"PRIu64, + iq->name, + id, + be64toh(msg->orig_src), + be64toh(msg->orig_index) + ); + } + + const void * const data = iqueue_get_data(iq, offset, 1); + if (!data) + { + TSLOGX(TSERROR, + "%s: %"PRIu64": Unable to retrieve data at offset %"PRIu64"?", + iq->name, + id, + offset + ); + return; + } + + TSHDUMP(TSINFO, data, len); +} + + +#define IQUEUE_WRITER_MAX (1<<15) +#define IQUEUE_WRITER_MASK (256-1) + +static shash_t * +_iqueue_writer_table( + iqueue_t * const iq, + unsigned table_id, + int create +) +{ + if (table_id >= IQUEUE_WRITER_TABLES) + return NULL; + if (iq->writer_tables[table_id]) + return iq->writer_tables[table_id]; + + iqueue_index_t * const idx = iq->idx; + iqueue_msg_t table_msg = idx->writer_tables[table_id]; + + if (table_msg.v == 0) + { + if (!create) + return NULL; + + const size_t table_len = IQUEUE_WRITER_MAX * sizeof(shash_entry_t); + const size_t table_max_len = table_len + IQUEUE_WRITER_MASK; + + void * const table_buf = iqueue_allocate_raw( + iq, + table_max_len, + &table_msg + ); + if (!table_buf) + { + TSLOGX(TSERROR, 
"%s: Unable to allocate table space %zu bytes",
 iqueue_name(iq),
 table_max_len
 );
 return NULL;
 }

 // Force alignment of the table since it will have 16-byte
 // CAS operations done on it.
 uint64_t offset = iqueue_msg_offset(table_msg);
 offset = (offset + IQUEUE_WRITER_MASK) & ~IQUEUE_WRITER_MASK;
 table_msg = iqueue_msg(offset, table_len);

 // Atomic swap it into the header; if this fails we do not care.
 // Some space in the iqueue will leak, but that is not a problem.
 atomic_cas_64(
 (void*)(uintptr_t) &idx->writer_tables[table_id].v,
 0,
 table_msg.v
 );

 // Re-read the writer_table; either we succeeded or someone else has
 // already written to it.
 table_msg = idx->writer_tables[table_id];

 // NOTE(review): this is logged even when the CAS lost the race, in
 // which case the "created" table is actually another process's.
 TSLOGX(TSINFO, "%s: Created writer table offset 0x%"PRIx64" size %zu",
 iqueue_name(iq),
 iqueue_msg_offset(table_msg),
 iqueue_msg_len(table_msg)
 );
 }

 // Map the (possibly freshly created) table and wrap it in a shash.
 const size_t table_len = iqueue_msg_len(table_msg);
 const uint64_t table_offset = iqueue_msg_offset(table_msg);

 const void * const table_buf = iqueue_get_data(iq, table_offset, 1);

 if (!table_buf)
 {
 TSLOGX(TSERROR, "%s: Unable to retrieve table at offset 0x%"PRIx64,
 iqueue_name(iq),
 table_offset
 );
 return NULL;
 }

 shash_t * const sh = shash_create(table_buf, table_len, 0);
 if (!sh)
 {
 TSLOGX(TSERROR, "%s: Unable to generate table %p @ %zu",
 iqueue_name(iq),
 table_buf,
 table_len
 );
 return NULL;
 }

 // This might race with another thread, causing this to leak.
 // Oh well. 
+ iq->writer_tables[table_id] = sh;

 return sh;
}


/** Public wrapper around _iqueue_writer_table(). */
shash_t *
iqueue_writer_table(
 iqueue_t * const iq,
 unsigned table_id,
 int create
)
{
 return _iqueue_writer_table(iq, table_id, create);
}



/** Advance a writer heartbeat entry to new_timestamp.
 *
 * Returns 1 if this call performed the update, 0 if the entry already
 * held a value >= new_timestamp (and was not the -1 sentinel). Loops
 * until the CAS inside shash_update() succeeds or the update becomes
 * unnecessary.
 */
int
iqueue_writer_update(
 shash_t * const sh,
 shash_entry_t * const writer,
 const uint64_t new_timestamp
)
{
 while (1)
 {
 const uint64_t cur_timestamp = writer->value;

 // If the new value is less than the old value
 // (and the old value is not -1), then there is no update
 // to be performed.
 if (cur_timestamp != (uint64_t) -1
 && cur_timestamp >= new_timestamp)
 return 0;

 if (shash_update(
 sh,
 writer,
 cur_timestamp,
 new_timestamp
 ))
 return 1;
 }
}
diff --git a/src/shash.c b/src/shash.c
new file mode 100644
index 0000000..2dbba32
--- /dev/null
+++ b/src/shash.c
@@ -0,0 +1,342 @@
+
+#include "twosigma.h"
+#include "tslog.h"
+#include "shash.h"
+#include "atomic.h"
+#include "tslock.h"



// Instantiate the tsflexhash template as `shash_cache`: a fixed-capacity
// uint64_t -> shash_entry_t* map used as the process-local key cache.
#define TSFLEXHASH_NAME shash_cache
#define TSFLEXHASH_KEY_TYPE uint64_t
#define TSFLEXHASH_VAL_TYPE shash_entry_t *
#define TSFLEXHASH_COPY_VAL 1
#define TSFLEXHASH_CAPACITY 65536
#include "tsflexhash.h"


/** Process-local view of a shared, fixed-size hash table. */
struct _shash_t
{
 // Pointer to the shared table.
 shash_entry_t * const hash;

 // Maximum number of entries in the shared table.
 // This is fixed at create time.
 const unsigned count;

 // If the table is read only, shash_insert() and shash_update()
 // calls will fail.
 const unsigned read_only;

 // Local cache of the keys with pointers to the entries.
 // This is updated on every failed shash_get() and shash_insert()
 // call and protected with cache_lock from other threads in the
 // same processes.
 shash_cache_t * const cache;
 tslock_t * const cache_lock;

 // Record of how many entries have been read from the shared table
 // so that the cache refresh doesn't need to re-read the entire
 // table. 
+ unsigned scan_index;
};


/** Populate the hash with any new entries in the shared table.
 * Must be called with the cache lock held.
 *
 * Scans forward from scan_index to the first empty slot (key == 0);
 * slots before scan_index were cached by an earlier scan and are not
 * revisited.
 */
static void
_shash_populate(
 shash_t * const sh
)
{
 if (unlikely(!tsislocked(sh->cache_lock)))
 TSABORTX("%p: Cache is not locked?", sh->hash);

 unsigned i;

 for (i = sh->scan_index ; i < sh->count ; i++)
 {
 // Copy the struct to avoid race conditions in multiple reads
 shash_entry_t * const entry = &sh->hash[i];
 const uint64_t key = entry->key;

 if (key == 0)
 break;

 // A duplicate key means the shared table is corrupt: abort.
 if (shash_cache_get(sh->cache, key) != NULL)
 TSABORTX("key 0x%"PRIx64" already in cache? shash fatal failure",
 key
 );

 shash_cache_insert(sh->cache, key, entry);
 }

 TSLOGXL(TSDEBUG, "%p: Scanned from %u to %u",
 sh->hash,
 sh->scan_index,
 i
 );

 sh->scan_index = i;
}


/** Check to see if a key exists in the local cache, and if not,
 * repopulate the cache from the shared array.
 *
 * Must be called with the cache lock held.
 */
static shash_entry_t *
_shash_get(
 shash_t * const sh,
 const uint64_t key
)
{
 if (unlikely(!tsislocked(sh->cache_lock)))
 TSABORTX("%p: Cache is not locked?", sh->hash);

 shash_entry_t * const entry = shash_cache_get(sh->cache, key);
 if (entry)
 return entry;

 // Not found? 
Try updating the cache
 _shash_populate(sh);
 return shash_cache_get(sh->cache, key);
}




/** Wrap an existing shared-memory region as a shash.
 *
 * shash_ptr must be 16-byte aligned (required for the 16-byte CAS used
 * on entries); shash_len fixes the entry capacity. Returns NULL on bad
 * alignment or allocation failure. Free the handle with
 * shash_destroy(); the shared region itself is not owned.
 */
shash_t *
shash_create(
 const void * shash_ptr,
 size_t shash_len,
 int read_only
)
{
 const uintptr_t shash_addr = (uintptr_t) shash_ptr;
 if (shash_addr & 0xF)
 {
 TSLOGXL(TSERROR, "%p: Incorrect alignment for shash!", shash_ptr);
 return NULL;
 }

 shash_t * const sh = calloc(1, sizeof(*sh));
 if (!sh)
 {
 TSLOGL(TSERROR, "allocation failure");
 return NULL;
 }

 // memcpy from a compound literal initializes the const-qualified
 // members after the calloc.
 // NOTE(review): the shash_cache_create()/tslock_alloc() results are
 // not checked for NULL -- presumably they abort on failure; confirm.
 memcpy(sh, &(shash_t) {
 .hash = (void*) shash_addr,
 .count = shash_len / sizeof(*sh->hash),
 .cache = shash_cache_create(),
 .cache_lock = tslock_alloc(),
 .read_only = read_only,
 .scan_index = 0,
 }, sizeof(*sh));

 // The cache does not need to be locked at this point since the
 // shash isn't shared with any other threads, but _shash_populate()
 // enforces a lock check.
 tslock(sh->cache_lock);
 _shash_populate(sh);
 tsunlock(sh->cache_lock);

 return sh;
}


/** Create an independent local view (fresh cache and lock) of the same
 * shared table as old_sh, e.g. for use by another thread.
 */
shash_t *
shash_copy(
 shash_t * const old_sh
)
{
 shash_t * const sh = calloc(1, sizeof(*sh));
 if (!sh)
 {
 TSLOGL(TSERROR, "allocation failure");
 return NULL;
 }

 memcpy(sh, &(shash_t) {
 .hash = old_sh->hash,
 .count = old_sh->count,
 .cache = shash_cache_create(),
 .cache_lock = tslock_alloc(),
 .read_only = old_sh->read_only,
 .scan_index = 0,
 }, sizeof(*sh));

 // The cache does not need to be locked at this point since the
 // shash isn't shared with any other threads, but _shash_populate()
 // enforces a lock check. 
+ tslock(sh->cache_lock);
 _shash_populate(sh);
 tsunlock(sh->cache_lock);

 return sh;
}


/** Free the local view; the shared table itself is untouched. */
void
shash_destroy(
 shash_t * const sh
)
{
 shash_cache_destroy(sh->cache);
 free(sh);
}


/** Look up key, refreshing the local cache from the shared table if
 * necessary. Returns NULL if the key is not present.
 */
shash_entry_t *
shash_get(
 shash_t * const sh,
 const uint64_t key
)
{
 tslock(sh->cache_lock);
 shash_entry_t * const entry = _shash_get(sh, key);
 tsunlock(sh->cache_lock);
 return entry;
}


/** Expose the raw shared entry array and, optionally, its capacity. */
shash_entry_t *
shash_entries(
 shash_t * const sh,
 unsigned * const max_entries_out
)
{
 if (max_entries_out)
 *max_entries_out = sh->count;
 return sh->hash;
}


/** 16-byte compare and swap.
 * gcc-4.4 doesn't have an intrinsic for this operation, so it is implemented
 * with inline assembly here. This should likely be moved to base/atomic.
 *
 * Returns non-zero iff {old1,old2} matched the memory operand and
 * {new1,new2} was stored; aborts if addr is not 16-byte aligned.
 */
static inline int
cmpxchg16b(
 volatile void * addr,
 uint64_t old1,
 uint64_t old2,
 uint64_t new1,
 uint64_t new2
)
{
 char result;

 volatile uint64_t * const ptr = (volatile void*) addr;
 if (unlikely(0xF & (uintptr_t) ptr))
 TSABORTX("%p: Insufficient alignment for 16-byte atomics", addr);

 TSLOGXL(TSDIAG, "%p: %"PRIx64":%"PRIx64" -> %"PRIx64":%"PRIx64"",
 addr,
 ptr[0],
 ptr[1],
 new1,
 new2
 );

 // cmpxchg16b compares rdx:rax against the memory operand and, on a
 // match, stores rcx:rbx; setz captures the ZF success flag.
 __asm__ __volatile__(
 "lock; cmpxchg16b %0; setz %1"
 : "=m"(*ptr), "=q"(result)
 : "m"(*ptr), "d" (old2), "a" (old1), "c" (new2), "b" (new1)
 : "memory"
 );

 return (int) result;
}


/** Must be called with the cache_lock held. 
*/ +static shash_entry_t * +_shash_insert( + shash_t * const sh, + const uint64_t key, + const uint64_t value +) +{ + if (unlikely(!tsislocked(sh->cache_lock))) + TSABORTX("%p: Cache is not locked?", sh->hash); + + if (key == 0 || sh->read_only) + return NULL; + + while (1) + { + shash_entry_t * const old_entry = _shash_get(sh, key); + if (old_entry) + { + TSLOGXL(TSDIAG, "%p: Key 0x%"PRIx64" exists!", sh->hash, key); + return NULL; + } + + // Make sure it is still within the allocated region + const unsigned slot = sh->scan_index; + if (unlikely(slot >= sh->count)) + { + TSLOGXL(TSERROR, "%p: Shared hash is full!", sh->hash); + return NULL; + } + + // sh->scan_index now points to the first empty slot. + shash_entry_t * const entry = &sh->hash[slot]; + + // If it is still empty we can cmpxchg16 our new entry + // into its place. Let's see how it goes + if (!cmpxchg16b(entry, 0, 0, key, value)) + continue; + + // We won! + TSLOGXL(TSDEBUG, "%p: Key 0x%"PRIx64" inserted in slot %u", + sh->hash, + key, + slot + ); + + return entry; + } +} + + +shash_entry_t * +shash_insert( + shash_t * const sh, + const uint64_t key, + const uint64_t value +) +{ + tslock(sh->cache_lock); + shash_entry_t * const entry = _shash_insert(sh, key, value); + tsunlock(sh->cache_lock); + return entry; +} + + +int +shash_update( + shash_t * sh, + shash_entry_t * entry, + uint64_t old_value, + uint64_t new_value +) +{ + // Verify that it is safe to write into this entry + if (sh->read_only + || entry < &sh->hash[0] + || entry >= &sh->hash[sh->count]) + return 0; + + // No locks need to be held for this update + return atomic_cas_bool_64( + (void*)(uintptr_t) &entry->value, + old_value, + new_value + ); +}