better handling of context cancellation to stop program
cornelk committed Jan 12, 2024
1 parent bc0bcaf commit 1d3fc7e
Showing 3 changed files with 36 additions and 14 deletions.
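
For context: the change only has an effect if the context handed to run (and through it to sc.Start) is cancelled when the user aborts the program. That wiring is not part of this commit, but a minimal sketch of the assumed setup (the signal handling and the run stand-in here are illustrative, not the repository's actual code) looks like this:

package main

import (
	"context"
	"errors"
	"fmt"
	"os"
	"os/signal"
)

func main() {
	// Cancel the context when the user presses Ctrl-C (SIGINT).
	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
	defer stop()

	if err := run(ctx); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

// run stands in for the real run function in main.go below.
func run(ctx context.Context) error {
	// A cancelled context surfaces as context.Canceled from ctx.Err()
	// (or from any operation that honors the context, such as sc.Start).
	if err := ctx.Err(); err != nil {
		if errors.Is(err, context.Canceled) {
			os.Exit(0) // user-requested stop: exit cleanly, as in the main.go change below
		}
		return fmt.Errorf("scraping: %w", err)
	}
	return nil
}
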
4 changes: 4 additions & 0 deletions main.go
@@ -137,6 +137,10 @@ func run(ctx context.Context, args arguments) error {
 
 		logger.Info("Scraping", log.String("url", sc.URL.String()))
 		if err = sc.Start(ctx); err != nil {
+			if errors.Is(err, context.Canceled) {
+				os.Exit(0)
+			}
+
 			return fmt.Errorf("scraping '%s': %w", sc.URL, err)
 		}
 	}
26 changes: 18 additions & 8 deletions scraper/download.go
@@ -3,6 +3,7 @@ package scraper
 import (
 	"bytes"
 	"context"
+	"fmt"
 	"net/url"
 	"os"
 
@@ -14,7 +15,7 @@ import (
 // a downloaded file content before it will be stored on disk.
 type assetProcessor func(URL *url.URL, buf *bytes.Buffer) *bytes.Buffer
 
-func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index) {
+func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index) error {
 	references, err := index.URLs("img")
 	if err != nil {
 		s.logger.Error("Getting img nodes URLs failed", log.Err(err))
@@ -26,34 +27,41 @@ func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index
 		s.logger.Error("Getting link nodes URLs failed", log.Err(err))
 	}
 	for _, ur := range references {
-		s.downloadAsset(ctx, ur, s.checkCSSForUrls)
+		if err := s.downloadAsset(ctx, ur, s.checkCSSForUrls); err != nil {
+			return err
+		}
 	}
 
 	references, err = index.URLs("script")
 	if err != nil {
 		s.logger.Error("Getting script nodes URLs failed", log.Err(err))
 	}
 	for _, ur := range references {
-		s.downloadAsset(ctx, ur, nil)
+		if err := s.downloadAsset(ctx, ur, nil); err != nil {
+			return err
+		}
 	}
 
 	for _, image := range s.imagesQueue {
-		s.downloadAsset(ctx, image, s.checkImageForRecode)
+		if err := s.downloadAsset(ctx, image, s.checkImageForRecode); err != nil {
+			return err
+		}
 	}
 	s.imagesQueue = nil
+	return nil
 }
 
 // downloadAsset downloads an asset if it does not exist on disk yet.
-func (s *Scraper) downloadAsset(ctx context.Context, u *url.URL, processor assetProcessor) {
+func (s *Scraper) downloadAsset(ctx context.Context, u *url.URL, processor assetProcessor) error {
 	urlFull := u.String()
 
 	if !s.shouldURLBeDownloaded(u, 0, true) {
-		return
+		return nil
 	}
 
 	filePath := s.getFilePath(u, false)
 	if _, err := os.Stat(filePath); !os.IsNotExist(err) {
-		return // exists already on disk
+		return nil // exists already on disk
 	}
 
 	s.logger.Info("Downloading asset", log.String("url", urlFull))
@@ -64,7 +72,7 @@ func (s *Scraper) downloadAsset(ctx context.Context, u *url.URL, processor asset
 		s.logger.Error("Downloading asset failed",
 			log.String("url", urlFull),
 			log.Err(err))
-		return
+		return fmt.Errorf("downloading asset: %w", err)
 	}
 
 	if processor != nil {
@@ -77,4 +85,6 @@ func (s *Scraper) downloadAsset(ctx context.Context, u *url.URL, processor asset
 			log.String("file", filePath),
 			log.Err(err))
	}
+
+	return nil
 }
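
Note that downloadAsset now wraps the underlying error with %w, and downloadWebpage wraps again further up the chain; because %w preserves the error chain, the errors.Is(err, context.Canceled) check in main.go still matches after any number of wraps. A small self-contained sketch (the function names and URL are illustrative, not from the repository):

package main

import (
	"context"
	"errors"
	"fmt"
)

// download stands in for an HTTP call that failed because its context was cancelled.
func download() error {
	return fmt.Errorf("downloading asset: %w", context.Canceled)
}

// scrape wraps the error once more, as downloadWebpage does.
func scrape() error {
	if err := download(); err != nil {
		return fmt.Errorf("scraping 'https://example.com': %w", err)
	}
	return nil
}

func main() {
	err := scrape()
	// errors.Is unwraps every %w layer, so the original cause is still detectable.
	fmt.Println(errors.Is(err, context.Canceled)) // true
}
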
20 changes: 14 additions & 6 deletions scraper/scraper.go
@@ -146,19 +146,23 @@ func (s *Scraper) Start(ctx context.Context) error {
 		return errors.New("start page is excluded from downloading")
 	}
 
-	s.downloadWebpage(ctx, s.URL, 0)
+	if err := s.downloadWebpage(ctx, s.URL, 0); err != nil {
+		return err
+	}
 
 	for len(s.webPageQueue) > 0 {
 		ur := s.webPageQueue[0]
 		s.webPageQueue = s.webPageQueue[1:]
 		currentDepth := s.webPageQueueDepth[ur.String()]
-		s.downloadWebpage(ctx, ur, currentDepth+1)
+		if err := s.downloadWebpage(ctx, ur, currentDepth+1); err != nil {
+			return err
+		}
 	}
 
 	return nil
 }
 
-func (s *Scraper) downloadWebpage(ctx context.Context, u *url.URL, currentDepth uint) {
+func (s *Scraper) downloadWebpage(ctx context.Context, u *url.URL, currentDepth uint) error {
 	buf := &bytes.Buffer{}
 
 	s.logger.Info("Downloading webpage", log.String("url", u.String()))
@@ -168,7 +172,7 @@ func (s *Scraper) downloadWebpage(ctx context.Context, u *url.URL, currentDepth
 		s.logger.Error("Processing HTTP Request failed",
 			log.String("url", u.String()),
 			log.Err(err))
-		return
+		return err
 	}
 
 	fileExtension := ""
@@ -189,15 +193,17 @@ func (s *Scraper) downloadWebpage(ctx context.Context, u *url.URL, currentDepth
 		s.logger.Error("Parsing HTML failed",
 			log.String("url", u.String()),
 			log.Err(err))
-		return
+		return fmt.Errorf("parsing HTML: %w", err)
 	}
 
 	index := htmlindex.New()
 	index.Index(u, doc)
 
 	s.storeDownload(u, buf, doc, index, fileExtension)
 
-	s.downloadReferences(ctx, index)
+	if err := s.downloadReferences(ctx, index); err != nil {
+		return err
+	}
 
 	// check first and download afterward to not hit max depth limit for
 	// start page links because of recursive linking
@@ -213,6 +219,8 @@ func (s *Scraper) downloadWebpage(ctx context.Context, u *url.URL, currentDepth
 			s.webPageQueueDepth[ur.String()] = currentDepth
 		}
 	}
+
+	return nil
 }
 
 func (s *Scraper) sendHTTPRequest(ctx context.Context, u *url.URL, buf *bytes.Buffer) (*url.URL, error) {
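
The cancellation itself would originate in the HTTP layer: sendHTTPRequest is not shown in this diff, but assuming the request is built with http.NewRequestWithContext (an assumption, not confirmed by this commit), a cancelled context makes the client return an error that wraps context.Canceled, which then propagates through downloadWebpage and Start to the check in main.go. A hedged sketch of that mechanism:

package main

import (
	"context"
	"errors"
	"fmt"
	"net/http"
)

// fetch is an illustrative stand-in for a context-aware HTTP request.
func fetch(ctx context.Context, url string) error {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return fmt.Errorf("creating request: %w", err)
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		// With a cancelled ctx, Do returns an error wrapping context.Canceled.
		return fmt.Errorf("sending request: %w", err)
	}
	defer resp.Body.Close()
	return nil
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	cancel() // cancel immediately to simulate a user abort

	err := fetch(ctx, "https://example.com")
	fmt.Println(errors.Is(err, context.Canceled)) // true
}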
