From 706fda152df193947892bd4664cedda55da97f3d Mon Sep 17 00:00:00 2001
From: Josh Allmann
Date: Tue, 17 Dec 2024 13:03:18 -0800
Subject: [PATCH] ai/live: Add limited publish retries (#3315)

Only retry if the error occurs before sending any data, and
if the next segment hasn't arrived yet.
---
 server/ai_live_video.go      | 49 ++++++++++++++++++-----
 trickle/trickle_publisher.go | 77 ++++++++++++++++++++++++++++--------
 trickle/trickle_server.go    |  6 ++-
 3 files changed, 106 insertions(+), 26 deletions(-)

diff --git a/server/ai_live_video.go b/server/ai_live_video.go
index 3f1a59aac..c37eddb5f 100644
--- a/server/ai_live_video.go
+++ b/server/ai_live_video.go
@@ -54,31 +54,56 @@ func startTricklePublish(ctx context.Context, url *url.URL, params aiRequestPara
 		// check for end of stream
 		if _, eos := reader.(*media.EOSReader); eos {
 			if err := publisher.Close(); err != nil {
-				clog.Infof(ctx, "Error closing trickle publisher. err=%s", err)
+				clog.Infof(ctx, "Error closing trickle publisher. err=%v", err)
 			}
 			cancel()
 			return
 		}
-		if _, atMax := slowOrchChecker.BeginSegment(); atMax {
+		thisSeq, atMax := slowOrchChecker.BeginSegment()
+		if atMax {
 			clog.Infof(ctx, "Orchestrator is slow - terminating")
 			cancel()
 			return
 			// TODO kill the rest of the processing, including ingest
 			// TODO switch orchestrators
 		}
-		go func() {
+		go func(seq int) {
 			defer slowOrchChecker.EndSegment()
 			var r io.Reader = reader
 			if paymentProcessor != nil {
 				r = paymentProcessor.process(reader)
 			}
-			clog.V(8).Infof(ctx, "trickle publish writing data")
-			// TODO this blocks! very bad!
-			if err := publisher.Write(r); err != nil {
-				clog.Infof(ctx, "Error writing to trickle publisher. err=%s", err)
+			clog.V(8).Infof(ctx, "trickle publish writing data seq=%d", seq)
+			segment, err := publisher.Next()
+			if err != nil {
+				clog.Infof(ctx, "error getting next publish handle; dropping segment err=%v", err)
+				return
 			}
-		}()
+			for {
+				currentSeq := slowOrchChecker.GetCount()
+				if seq != currentSeq {
+					clog.Infof(ctx, "Next segment has already started; skipping this one seq=%d currentSeq=%d", seq, currentSeq)
+					return
+				}
+				n, err := segment.Write(r)
+				if err == nil {
+					// no error, all done, let's leave
+					return
+				}
+				// Retry segment only if nothing has been sent yet
+				// and the next segment has not yet started
+				// otherwise drop
+				if n > 0 {
+					clog.Infof(ctx, "Error publishing segment; dropping remainder wrote=%d err=%v", n, err)
+					return
+				}
+				clog.Infof(ctx, "Error publishing segment before writing; retrying err=%v", err)
+				// Clone in case the read head was incremented somewhere; cloning resets it
+				r = reader.Clone()
+				time.Sleep(250 * time.Millisecond)
+			}
+		}(thisSeq)
 	})
 	clog.Infof(ctx, "trickle pub")
 }
 
@@ -251,7 +276,7 @@ func startEventsSubscribe(ctx context.Context, url *url.URL, params aiRequestPar
 			}
 		}
 
-		clog.Infof(ctx, "Received event for stream=%s event=%+v", stream, event)
+		clog.V(8).Infof(ctx, "Received event for stream=%s event=%+v", stream, event)
 
 		eventType, ok := event["type"].(string)
 		if !ok {
@@ -328,3 +353,9 @@ func (s *SlowOrchChecker) EndSegment() {
 	defer s.mu.Unlock()
 	s.completeCount += 1
 }
+
+func (s *SlowOrchChecker) GetCount() int {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	return s.segmentCount
+}
diff --git a/trickle/trickle_publisher.go b/trickle/trickle_publisher.go
index a6ca937ff..2caae4459 100644
--- a/trickle/trickle_publisher.go
+++ b/trickle/trickle_publisher.go
@@ -36,6 +36,10 @@ type pendingPost struct {
 	index  int
 	writer *io.PipeWriter
 	errCh  chan error
+
+	// needed to help with reconnects
+	written bool
+	client  *TricklePublisher
 }
 
 // NewTricklePublisher creates a new trickle stream client
@@ -53,7 +57,6 @@ func NewTricklePublisher(url string) (*TricklePublisher, error) {
 	return c, nil
 }
 
-// Acquire lock to manage access to pendingPost and index
 // NB expects to have the lock already since we mutate the index
 func (c *TricklePublisher) preconnect() (*pendingPost, error) {
 
@@ -113,6 +116,7 @@ func (c *TricklePublisher) preconnect() (*pendingPost, error) {
 		writer: pw,
 		index:  index,
 		errCh:  errCh,
+		client: c,
 	}, nil
 }
 
@@ -136,11 +140,10 @@ func (c *TricklePublisher) Close() error {
 	return nil
 }
 
-// Write sends data to the current segment, sets up the next segment concurrently, and blocks until completion
-func (c *TricklePublisher) Write(data io.Reader) error {
-
+func (c *TricklePublisher) Next() (*pendingPost, error) {
 	// Acquire lock to manage access to pendingPost and index
 	c.writeLock.Lock()
+	defer c.writeLock.Unlock()
 
 	// Get the writer to use
 	pp := c.pendingPost
@@ -148,29 +151,61 @@ func (c *TricklePublisher) Write(data io.Reader) error {
 		p, err := c.preconnect()
 		if err != nil {
 			c.writeLock.Unlock()
-			return err
+			return nil, err
 		}
 		pp = p
 	}
-	writer := pp.writer
-	index := pp.index
-	errCh := pp.errCh
 
 	// Set up the next connection
 	nextPost, err := c.preconnect()
 	if err != nil {
 		c.writeLock.Unlock()
-		return err
+		return nil, err
 	}
 	c.pendingPost = nextPost
 
-	// Now unlock so the copy does not block
-	c.writeLock.Unlock()
+	return pp, nil
+}
+
+func (p *pendingPost) reconnect() (*pendingPost, error) {
+	// This is a little gnarly but works for now:
+	// Set the publisher's sequence to the intended reconnect index
+	// Call publisher's preconnect (which increments its sequence)
+	// then reset publisher's sequence back to the original
+	//slog.Info("Re-connecting", "url", p.client.baseURL, "seq", p.client.index)
+	p.client.writeLock.Lock()
+	defer p.client.writeLock.Unlock()
+	currentSeq := p.client.index
+	p.client.index = p.index
+	pp, err := p.client.preconnect()
+	p.client.index = currentSeq
+	return pp, err
+}
+
+func (p *pendingPost) Write(data io.Reader) (int64, error) {
+
+	// If writing multiple times, reconnect
+	if p.written {
+		pp, err := p.reconnect()
+		if err != nil {
+			return 0, err
+		}
+		p = pp
+	}
+
+	var (
+		writer = p.writer
+		index  = p.index
+		errCh  = p.errCh
+	)
+
+	// Mark as written
+	p.written = true
 
 	// before writing, check for error from preconnects
 	select {
 	case err := <-errCh:
-		return err
+		return 0, err
 	default:
 		// no error, continue
 	}
@@ -192,18 +227,28 @@ func (c *TricklePublisher) Write(data io.Reader) error {
 	// also prioritize errors over this channel compared to io errors
 	// such as "read/write on closed pipe"
 	if err := <-errCh; err != nil {
-		return err
+		return n, err
 	}
 
 	if ioError != nil {
-		return fmt.Errorf("error streaming data to segment %d: %w", index, err)
+		return n, fmt.Errorf("error streaming data to segment %d: %w", index, ioError)
 	}
 
 	if closeErr != nil {
-		return fmt.Errorf("error closing writer for segment %d: %w", index, err)
+		return n, fmt.Errorf("error closing writer for segment %d: %w", index, closeErr)
 	}
 
-	return nil
+	return n, nil
+}
+
+// Write sends data to the current segment, sets up the next segment concurrently, and blocks until completion
+func (c *TricklePublisher) Write(data io.Reader) error {
+	pp, err := c.Next()
+	if err != nil {
+		return err
+	}
+	_, err = pp.Write(data)
+	return err
 }
 
 func humanBytes(bytes int64) string {
diff --git a/trickle/trickle_server.go b/trickle/trickle_server.go
index ddeb6679f..785cd3859 100644
--- a/trickle/trickle_server.go
+++ b/trickle/trickle_server.go
@@ -333,7 +333,9 @@ func (s *Stream) handlePost(w http.ResponseWriter, r *http.Request, idx int) {
 	if exists {
 		slog.Warn("Overwriting existing entry", "idx", idx)
 		// Overwrite anything that exists now. TODO figure out a safer behavior?
-		return
+		// TODO fix concurrent writes to the same segment; would be very bad
+		segment.buffer.Reset()
+		segment.closed = false
 	}
 
 	// Wrap the request body with the custom timeoutReader so we can send
@@ -527,6 +529,8 @@ func (s *Segment) readData(startPos int) ([]byte, bool) {
 	}
 	if startPos > totalLen {
 		slog.Info("Invalid start pos, invoking eof")
+		// This might happen if the buffer was reset
+		// e.g. because of a repeated POST
 		return nil, true
 	}
 	if s.closed {
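
Editor's note: the retry policy this patch adds reduces to a small decision loop. The sketch below is illustrative only and is not part of the patch; publishWithRetry, segmentWriter, and flakySegment are hypothetical stand-ins for the real publisher types, while the zero-bytes-written check, the current-sequence comparison, and the 250 ms pause mirror the logic added to startTricklePublish.

// Illustrative sketch only (not part of the patch): retry a segment while
// nothing has been written yet and no newer segment has started.
package main

import (
	"errors"
	"fmt"
	"io"
	"strings"
	"time"
)

// segmentWriter is a hypothetical stand-in for the patch's *pendingPost.
type segmentWriter interface {
	Write(data io.Reader) (int64, error)
}

// publishWithRetry mirrors the loop added to startTricklePublish: retry only
// while this is still the latest segment (currentSeq) and the previous
// attempt failed before sending any bytes.
func publishWithRetry(seq int, currentSeq func() int, seg segmentWriter, clone func() io.Reader) error {
	r := clone()
	for {
		if latest := currentSeq(); seq != latest {
			return fmt.Errorf("segment %d superseded by %d; dropping", seq, latest)
		}
		n, err := seg.Write(r)
		if err == nil {
			return nil // fully published
		}
		if n > 0 {
			// Partial data already went out; retrying could resend it, so drop.
			return fmt.Errorf("segment %d failed after %d bytes: %w", seq, n, err)
		}
		// Nothing was sent yet: safe to retry with a fresh reader.
		r = clone()
		time.Sleep(250 * time.Millisecond)
	}
}

// flakySegment fails once before writing anything, then succeeds.
type flakySegment struct{ failures int }

func (f *flakySegment) Write(data io.Reader) (int64, error) {
	if f.failures > 0 {
		f.failures--
		return 0, errors.New("connection refused")
	}
	return io.Copy(io.Discard, data)
}

func main() {
	seg := &flakySegment{failures: 1}
	err := publishWithRetry(
		3,                        // this segment's sequence number
		func() int { return 3 }, // no newer segment has started yet
		seg,
		func() io.Reader { return strings.NewReader("segment payload") },
	)
	fmt.Println("publish result:", err) // <nil> after one retry
}

As in the patch, a segment that fails after a partial write is dropped rather than retried, since a retry could resend bytes within the same segment.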